1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to determine the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
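//
// For example (an illustrative sketch, ignoring remainder handling), with a
// vector width of 4 a scalar loop such as
//
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
//
// is conceptually rewritten so that each iteration processes four elements:
//
//   for (i = 0; i + 3 < n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3];  // one wide SIMD add per iteration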
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is a development effort going on to migrate the loop vectorizer to
// the VPlan infrastructure and to introduce outer loop vectorization support
// (see docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/LLVMContext.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/InstructionCost.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142 #include "llvm/Transforms/Utils/SizeOpts.h"
143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144 #include <algorithm>
145 #include <cassert>
146 #include <cstdint>
147 #include <cstdlib>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
/// Loops with a known constant trip count below this number are vectorized
/// only if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
202     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks with a "
204              "vectorize(enable) pragma."));
205 
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; the enum below lists the
// options. I.e., the vectorizer will try to fold the tail loop (epilogue)
// into the vector body and predicate the instructions accordingly. If
// tail-folding fails, there are different fallback strategies depending on
// these values:
211 namespace PreferPredicateTy {
212   enum Option {
213     ScalarEpilogue = 0,
214     PredicateElseScalarEpilogue,
215     PredicateOrDontVectorize
216   };
217 } // namespace PreferPredicateTy
218 
219 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
220     "prefer-predicate-over-epilogue",
221     cl::init(PreferPredicateTy::ScalarEpilogue),
222     cl::Hidden,
223     cl::desc("Tail-folding and predication preferences over creating a scalar "
224              "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "Prefer tail-folding, create scalar epilogue if "
                          "tail-folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "Prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
236 
237 static cl::opt<bool> MaximizeBandwidth(
238     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
239     cl::desc("Maximize bandwidth when selecting vectorization factor which "
240              "will be determined by the smallest type in loop."));
241 
242 static cl::opt<bool> EnableInterleavedMemAccesses(
243     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
245 
246 /// An interleave-group may need masking if it resides in a block that needs
247 /// predication, or in order to mask away gaps.
248 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
249     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in "
             "a loop"));
251 
252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
253     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
255              "below this number"));
256 
257 static cl::opt<unsigned> ForceTargetNumScalarRegs(
258     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
259     cl::desc("A flag that overrides the target's number of scalar registers."));
260 
261 static cl::opt<unsigned> ForceTargetNumVectorRegs(
262     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
263     cl::desc("A flag that overrides the target's number of vector registers."));
264 
265 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
266     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
267     cl::desc("A flag that overrides the target's max interleave factor for "
268              "scalar loops."));
269 
270 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
271     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
272     cl::desc("A flag that overrides the target's max interleave factor for "
273              "vectorized loops."));
274 
275 static cl::opt<unsigned> ForceTargetInstructionCost(
276     "force-target-instruction-cost", cl::init(0), cl::Hidden,
277     cl::desc("A flag that overrides the target's expected cost for "
278              "an instruction to a single constant value. Mostly "
279              "useful for getting consistent testing."));
280 
281 static cl::opt<bool> ForceTargetSupportsScalableVectors(
282     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
283     cl::desc(
284         "Pretend that scalable vectors are supported, even if the target does "
285         "not support them. This flag should only be used for testing."));
286 
287 static cl::opt<unsigned> SmallLoopCost(
288     "small-loop-cost", cl::init(20), cl::Hidden,
289     cl::desc(
290         "The cost of a loop that is considered 'small' by the interleaver."));
291 
292 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
293     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
294     cl::desc("Enable the use of the block frequency analysis to access PGO "
295              "heuristics minimizing code growth in cold regions and being more "
296              "aggressive in hot regions."));
297 
298 // Runtime interleave loops for load/store throughput.
299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
300     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
301     cl::desc(
302         "Enable runtime interleaving until load/store ports are saturated"));
303 
304 /// Interleave small loops with scalar reductions.
305 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
306     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
307     cl::desc("Enable interleaving for loops with small iteration counts that "
308              "contain scalar reductions to expose ILP."));
309 
310 /// The number of stores in a loop that are allowed to need predication.
311 static cl::opt<unsigned> NumberOfStoresToPredicate(
312     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
313     cl::desc("Max number of stores to be predicated behind an if."));
314 
315 static cl::opt<bool> EnableIndVarRegisterHeur(
316     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
317     cl::desc("Count the induction variable only once when interleaving"));
318 
319 static cl::opt<bool> EnableCondStoresVectorization(
320     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));
322 
323 static cl::opt<unsigned> MaxNestedScalarReductionIC(
324     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
325     cl::desc("The maximum interleave count to use when interleaving a scalar "
326              "reduction in a nested loop."));
327 
328 static cl::opt<bool>
329     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
330                            cl::Hidden,
331                            cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));
333 
334 static cl::opt<bool> ForceOrderedReductions(
335     "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
337              "FP reductions"));
338 
339 static cl::opt<bool> PreferPredicatedReductionSelect(
340     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
341     cl::desc(
342         "Prefer predicating a reduction operation over an after loop select."));
343 
344 cl::opt<bool> EnableVPlanNativePath(
345     "enable-vplan-native-path", cl::init(false), cl::Hidden,
346     cl::desc("Enable VPlan-native vectorization path with "
347              "support for outer loop vectorization."));
348 
349 // FIXME: Remove this switch once we have divergence analysis. Currently we
350 // assume divergent non-backedge branches when this switch is true.
351 cl::opt<bool> EnableVPlanPredication(
352     "enable-vplan-predication", cl::init(false), cl::Hidden,
353     cl::desc("Enable VPlan-native vectorization path predicator with "
354              "support for outer loop vectorization."));
355 
356 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
358 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
359 // verification of the H-CFGs built.
360 static cl::opt<bool> VPlanBuildStressTest(
361     "vplan-build-stress-test", cl::init(false), cl::Hidden,
362     cl::desc(
363         "Build VPlan for every supported loop nest in the function and bail "
364         "out right after the build (stress test the VPlan H-CFG construction "
365         "in the VPlan-native vectorization path)."));
366 
367 cl::opt<bool> llvm::EnableLoopInterleaving(
368     "interleave-loops", cl::init(true), cl::Hidden,
369     cl::desc("Enable loop interleaving in Loop vectorization passes"));
370 cl::opt<bool> llvm::EnableLoopVectorization(
371     "vectorize-loops", cl::init(true), cl::Hidden,
372     cl::desc("Run the Loop vectorization passes"));
373 
374 cl::opt<bool> PrintVPlansInDotFormat(
375     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
376     cl::desc("Use dot format instead of plain text when dumping VPlans"));
377 
378 /// A helper function that returns true if the given type is irregular. The
379 /// type is irregular if its allocated size doesn't equal the store size of an
380 /// element of the corresponding vector type.
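///
/// For example, on typical x86 data layouts x86_fp80 has a type size of 80
/// bits but an allocation size of 96 or 128 bits, so an array of it cannot be
/// reinterpreted as a vector without accounting for the padding.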
381 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
382   // Determine if an array of N elements of type Ty is "bitcast compatible"
383   // with a <N x Ty> vector.
384   // This is only true if there is no padding between the array elements.
385   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
386 }
387 
388 /// A helper function that returns the reciprocal of the block probability of
389 /// predicated blocks. If we return X, we are assuming the predicated block
390 /// will execute once for every X iterations of the loop header.
391 ///
392 /// TODO: We should use actual block probability here, if available. Currently,
393 ///       we always assume predicated blocks have a 50% chance of executing.
394 static unsigned getReciprocalPredBlockProb() { return 2; }
395 
396 /// A helper function that returns an integer or floating-point constant with
397 /// value C.
398 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
399   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
400                            : ConstantFP::get(Ty, C);
401 }
402 
403 /// Returns "best known" trip count for the specified loop \p L as defined by
404 /// the following procedure:
405 ///   1) Returns exact trip count if it is known.
406 ///   2) Returns expected trip count according to profile data if any.
407 ///   3) Returns upper bound estimate if it is known.
408 ///   4) Returns None if all of the above failed.
409 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
410   // Check if exact trip count is known.
411   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
412     return ExpectedTC;
413 
414   // Check if there is an expected trip count available from profile data.
415   if (LoopVectorizeWithBlockFrequency)
416     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
417       return EstimatedTC;
418 
419   // Check if upper bound estimate is known.
420   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
421     return ExpectedTC;
422 
423   return None;
424 }
425 
426 // Forward declare GeneratedRTChecks.
427 class GeneratedRTChecks;
428 
429 namespace llvm {
430 
431 AnalysisKey ShouldRunExtraVectorPasses::Key;
432 
433 /// InnerLoopVectorizer vectorizes loops which contain only one basic
434 /// block to a specified vectorization factor (VF).
435 /// This class performs the widening of scalars into vectors, or multiple
436 /// scalars. This class also implements the following features:
437 /// * It inserts an epilogue loop for handling loops that don't have iteration
438 ///   counts that are known to be a multiple of the vectorization factor.
439 /// * It handles the code generation for reduction variables.
440 /// * Scalarization (implementation using scalars) of un-vectorizable
441 ///   instructions.
442 /// InnerLoopVectorizer does not perform any vectorization-legality
443 /// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
447 class InnerLoopVectorizer {
448 public:
449   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
450                       LoopInfo *LI, DominatorTree *DT,
451                       const TargetLibraryInfo *TLI,
452                       const TargetTransformInfo *TTI, AssumptionCache *AC,
453                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
454                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
455                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
456                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
457       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
458         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
459         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
460         PSI(PSI), RTChecks(RTChecks) {
461     // Query this against the original loop and save it here because the profile
462     // of the original loop header may change as the transformation happens.
463     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
464         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
465   }
466 
467   virtual ~InnerLoopVectorizer() = default;
468 
469   /// Create a new empty loop that will contain vectorized instructions later
470   /// on, while the old loop will be used as the scalar remainder. Control flow
471   /// is generated around the vectorized (and scalar epilogue) loops consisting
472   /// of various checks and bypasses. Return the pre-header block of the new
473   /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
475   /// handle the more complex control flow around the loops.
476   virtual BasicBlock *createVectorizedLoopSkeleton();
477 
478   /// Widen a single call instruction within the innermost loop.
479   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
480                             VPTransformState &State);
481 
  /// Fix the vectorized code, taking care of header phis, live-outs and more.
483   void fixVectorizedLoop(VPTransformState &State);
484 
485   // Return true if any runtime check is added.
486   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
487 
488   /// A type for vectorized values in the new loop. Each value from the
489   /// original loop, when vectorized, is represented by UF vector values in the
490   /// new unrolled loop, where UF is the unroll factor.
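  /// For example, with VF = 4 and UF = 2, an i32 value from the original loop
  /// is represented by two <4 x i32> vector values.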
491   using VectorParts = SmallVector<Value *, 2>;
492 
493   /// Vectorize a single first-order recurrence or pointer induction PHINode in
494   /// a block. This method handles the induction variable canonicalization. It
495   /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
496   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
497                            VPTransformState &State);
498 
  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates the scalar instance for the lane and part given by
  /// \p Instance. Uses the VPValue operands from \p RepRecipe instead of
  /// \p Instr's operands.
504   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
505                             const VPIteration &Instance, bool IfPredicateInstr,
506                             VPTransformState &State);
507 
508   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
509   /// is provided, the integer induction variable will first be truncated to
510   /// the corresponding type.
511   void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
512                              Value *Start, TruncInst *Trunc, VPValue *Def,
513                              VPTransformState &State);
514 
515   /// Construct the vector value of a scalarized value \p V one lane at a time.
516   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
517                                  VPTransformState &State);
518 
519   /// Try to vectorize interleaved access group \p Group with the base address
520   /// given in \p Addr, optionally masking the vector operations if \p
521   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
522   /// values in the vectorized loop.
523   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
524                                 ArrayRef<VPValue *> VPDefs,
525                                 VPTransformState &State, VPValue *Addr,
526                                 ArrayRef<VPValue *> StoredValues,
527                                 VPValue *BlockInMask = nullptr);
528 
  /// Set the debug location in the builder \p CustomBuilder using the debug
  /// location in \p V. If \p CustomBuilder is None then the class member's
  /// Builder is used.
531   void setDebugLocFromInst(const Value *V,
532                            Optional<IRBuilder<> *> CustomBuilder = None);
533 
534   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
535   void fixNonInductionPHIs(VPTransformState &State);
536 
537   /// Returns true if the reordering of FP operations is not allowed, but we are
538   /// able to vectorize with strict in-order reductions for the given RdxDesc.
539   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
540 
541   /// Create a broadcast instruction. This method generates a broadcast
542   /// instruction (shuffle) for loop invariant values and for the induction
543   /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// This is needed because each iteration in the loop corresponds to a SIMD
545   /// element.
546   virtual Value *getBroadcastInstrs(Value *V);
547 
548   /// Add metadata from one instruction to another.
549   ///
550   /// This includes both the original MDs from \p From and additional ones (\see
551   /// addNewMetadata).  Use this for *newly created* instructions in the vector
552   /// loop.
553   void addMetadata(Instruction *To, Instruction *From);
554 
555   /// Similar to the previous function but it adds the metadata to a
556   /// vector of instructions.
557   void addMetadata(ArrayRef<Value *> To, Instruction *From);
558 
559 protected:
560   friend class LoopVectorizationPlanner;
561 
562   /// A small list of PHINodes.
563   using PhiVector = SmallVector<PHINode *, 4>;
564 
565   /// A type for scalarized values in the new loop. Each value from the
566   /// original loop, when scalarized, is represented by UF x VF scalar values
567   /// in the new unrolled loop, where UF is the unroll factor and VF is the
568   /// vectorization factor.
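  /// For example, with VF = 4 and UF = 2, a scalarized value is represented
  /// by 2 x 4 = 8 scalar values, grouped as two parts of four lanes each.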
569   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
570 
571   /// Set up the values of the IVs correctly when exiting the vector loop.
572   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
573                     Value *CountRoundDown, Value *EndValue,
574                     BasicBlock *MiddleBlock);
575 
576   /// Create a new induction variable inside L.
577   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
578                                    Value *Step, Instruction *DL);
579 
580   /// Handle all cross-iteration phis in the header.
581   void fixCrossIterationPHIs(VPTransformState &State);
582 
583   /// Create the exit value of first order recurrences in the middle block and
584   /// update their users.
585   void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
586 
587   /// Create code for the loop exit value of the reduction.
588   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
589 
590   /// Clear NSW/NUW flags from reduction instructions if necessary.
591   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
592                                VPTransformState &State);
593 
594   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
595   /// means we need to add the appropriate incoming value from the middle
596   /// block as exiting edges from the scalar epilogue loop (if present) are
597   /// already in place, and we exit the vector loop exclusively to the middle
598   /// block.
599   void fixLCSSAPHIs(VPTransformState &State);
600 
601   /// Iteratively sink the scalarized operands of a predicated instruction into
602   /// the block that was created for it.
603   void sinkScalarOperands(Instruction *PredInst);
604 
605   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
606   /// represented as.
607   void truncateToMinimalBitwidths(VPTransformState &State);
608 
  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
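  /// For example, with StartIdx = 0 and Step = 1, a splat Val of <2, 2, 2, 2>
  /// becomes <2, 3, 4, 5>.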
613   virtual Value *
614   getStepVector(Value *Val, Value *StartIdx, Value *Step,
615                 Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);
616 
617   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
618   /// variable on which to base the steps, \p Step is the size of the step, and
619   /// \p EntryVal is the value from the original loop that maps to the steps.
620   /// Note that \p EntryVal doesn't have to be an induction variable - it
621   /// can also be a truncate instruction.
622   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
623                         const InductionDescriptor &ID, VPValue *Def,
624                         VPTransformState &State);
625 
626   /// Create a vector induction phi node based on an existing scalar one. \p
627   /// EntryVal is the value from the original loop that maps to the vector phi
628   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
629   /// truncate instruction, instead of widening the original IV, we widen a
630   /// version of the IV truncated to \p EntryVal's type.
631   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
632                                        Value *Step, Value *Start,
633                                        Instruction *EntryVal, VPValue *Def,
634                                        VPTransformState &State);
635 
636   /// Returns true if an instruction \p I should be scalarized instead of
637   /// vectorized for the chosen vectorization factor.
638   bool shouldScalarizeInstruction(Instruction *I) const;
639 
640   /// Returns true if we should generate a scalar version of \p IV.
641   bool needsScalarInduction(Instruction *IV) const;
642 
643   /// Generate a shuffle sequence that will reverse the vector Vec.
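  /// For example, <a, b, c, d> becomes <d, c, b, a>, typically via a
  /// shufflevector with mask <3, 2, 1, 0>.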
644   virtual Value *reverseVector(Value *Vec);
645 
646   /// Returns (and creates if needed) the original loop trip count.
647   Value *getOrCreateTripCount(Loop *NewLoop);
648 
649   /// Returns (and creates if needed) the trip count of the widened loop.
650   Value *getOrCreateVectorTripCount(Loop *NewLoop);
651 
652   /// Returns a bitcasted value to the requested vector type.
653   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
654   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
655                                 const DataLayout &DL);
656 
657   /// Emit a bypass check to see if the vector trip count is zero, including if
658   /// it overflows.
659   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
660 
661   /// Emit a bypass check to see if all of the SCEV assumptions we've
662   /// had to make are correct. Returns the block containing the checks or
663   /// nullptr if no checks have been added.
664   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
665 
666   /// Emit bypass checks to check any memory assumptions we may have made.
667   /// Returns the block containing the checks or nullptr if no checks have been
668   /// added.
669   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
670 
671   /// Compute the transformed value of Index at offset StartValue using step
672   /// StepValue.
673   /// For integer induction, returns StartValue + Index * StepValue.
674   /// For pointer induction, returns StartValue[Index * StepValue].
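  /// For example, an integer induction with StartValue 8 and StepValue 2
  /// yields 8 + Index * 2, i.e. 14 for an Index of 3.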
675   /// FIXME: The newly created binary instructions should contain nsw/nuw
676   /// flags, which can be found from the original scalar operations.
677   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
678                               const DataLayout &DL,
679                               const InductionDescriptor &ID,
680                               BasicBlock *VectorHeader) const;
681 
682   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
683   /// vector loop preheader, middle block and scalar preheader. Also
684   /// allocate a loop object for the new vector loop and return it.
685   Loop *createVectorLoopSkeleton(StringRef Prefix);
686 
687   /// Create new phi nodes for the induction variables to resume iteration count
688   /// in the scalar epilogue, from where the vectorized loop left off (given by
689   /// \p VectorTripCount).
690   /// In cases where the loop skeleton is more complicated (eg. epilogue
691   /// vectorization) and the resume values can come from an additional bypass
692   /// block, the \p AdditionalBypass pair provides information about the bypass
693   /// block and the end value on the edge from bypass to this loop.
694   void createInductionResumeValues(
695       Loop *L, Value *VectorTripCount,
696       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
697 
698   /// Complete the loop skeleton by adding debug MDs, creating appropriate
699   /// conditional branches in the middle block, preparing the builder and
700   /// running the verifier. Take in the vector loop \p L as argument, and return
701   /// the preheader of the completed vector loop.
702   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
703 
704   /// Add additional metadata to \p To that was not present on \p Orig.
705   ///
706   /// Currently this is used to add the noalias annotations based on the
707   /// inserted memchecks.  Use this for instructions that are *cloned* into the
708   /// vector loop.
709   void addNewMetadata(Instruction *To, const Instruction *Orig);
710 
711   /// Collect poison-generating recipes that may generate a poison value that is
712   /// used after vectorization, even when their operands are not poison. Those
713   /// recipes meet the following conditions:
714   ///  * Contribute to the address computation of a recipe generating a widen
715   ///    memory load/store (VPWidenMemoryInstructionRecipe or
716   ///    VPInterleaveRecipe).
717   ///  * Such a widen memory load/store has at least one underlying Instruction
718   ///    that is in a basic block that needs predication and after vectorization
719   ///    the generated instruction won't be predicated.
720   void collectPoisonGeneratingRecipes(VPTransformState &State);
721 
722   /// Allow subclasses to override and print debug traces before/after vplan
723   /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}
726 
727   /// The original loop.
728   Loop *OrigLoop;
729 
730   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
731   /// dynamic knowledge to simplify SCEV expressions and converts them to a
732   /// more usable form.
733   PredicatedScalarEvolution &PSE;
734 
735   /// Loop Info.
736   LoopInfo *LI;
737 
738   /// Dominator Tree.
739   DominatorTree *DT;
740 
741   /// Alias Analysis.
742   AAResults *AA;
743 
744   /// Target Library Info.
745   const TargetLibraryInfo *TLI;
746 
747   /// Target Transform Info.
748   const TargetTransformInfo *TTI;
749 
750   /// Assumption Cache.
751   AssumptionCache *AC;
752 
753   /// Interface to emit optimization remarks.
754   OptimizationRemarkEmitter *ORE;
755 
756   /// LoopVersioning.  It's only set up (non-null) if memchecks were
757   /// used.
758   ///
759   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
761   std::unique_ptr<LoopVersioning> LVer;
762 
763   /// The vectorization SIMD factor to use. Each vector will have this many
764   /// vector elements.
765   ElementCount VF;
766 
767   /// The vectorization unroll factor to use. Each scalar is vectorized to this
768   /// many different vector instructions.
769   unsigned UF;
770 
  /// The builder that we use.
772   IRBuilder<> Builder;
773 
774   // --- Vectorization state ---
775 
776   /// The vector-loop preheader.
777   BasicBlock *LoopVectorPreHeader;
778 
779   /// The scalar-loop preheader.
780   BasicBlock *LoopScalarPreHeader;
781 
782   /// Middle Block between the vector and the scalar.
783   BasicBlock *LoopMiddleBlock;
784 
785   /// The unique ExitBlock of the scalar loop if one exists.  Note that
786   /// there can be multiple exiting edges reaching this block.
787   BasicBlock *LoopExitBlock;
788 
789   /// The vector loop body.
790   BasicBlock *LoopVectorBody;
791 
792   /// The scalar loop body.
793   BasicBlock *LoopScalarBody;
794 
795   /// A list of all bypass blocks. The first block is the entry of the loop.
796   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
797 
798   /// The new Induction variable which was added to the new block.
799   PHINode *Induction = nullptr;
800 
801   /// The induction variable of the old basic block.
802   PHINode *OldInduction = nullptr;
803 
804   /// Store instructions that were predicated.
805   SmallVector<Instruction *, 4> PredicatedInstructions;
806 
807   /// Trip count of the original loop.
808   Value *TripCount = nullptr;
809 
810   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
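  /// For example, with an original trip count of 100, VF = 4 and UF = 2, the
  /// vector trip count is 100 - (100 % 8) = 96.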
811   Value *VectorTripCount = nullptr;
812 
813   /// The legality analysis.
814   LoopVectorizationLegality *Legal;
815 
  /// The profitability analysis.
817   LoopVectorizationCostModel *Cost;
818 
819   // Record whether runtime checks are added.
820   bool AddedSafetyChecks = false;
821 
822   // Holds the end values for each induction variable. We save the end values
  // so we can later fix up the external users of the induction variables.
824   DenseMap<PHINode *, Value *> IVEndValues;
825 
826   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
827   // fixed up at the end of vector code generation.
828   SmallVector<PHINode *, 8> OrigPHIsToFix;
829 
830   /// BFI and PSI are used to check for profile guided size optimizations.
831   BlockFrequencyInfo *BFI;
832   ProfileSummaryInfo *PSI;
833 
  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
836   bool OptForSizeBasedOnProfile;
837 
  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning up the checks if vectorization turns out to be unprofitable.
840   GeneratedRTChecks &RTChecks;
841 };
842 
843 class InnerLoopUnroller : public InnerLoopVectorizer {
844 public:
845   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
846                     LoopInfo *LI, DominatorTree *DT,
847                     const TargetLibraryInfo *TLI,
848                     const TargetTransformInfo *TTI, AssumptionCache *AC,
849                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
850                     LoopVectorizationLegality *LVL,
851                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
852                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
853       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
854                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
855                             BFI, PSI, Check) {}
856 
857 private:
858   Value *getBroadcastInstrs(Value *V) override;
859   Value *getStepVector(
860       Value *Val, Value *StartIdx, Value *Step,
861       Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
862   Value *reverseVector(Value *Vec) override;
863 };
864 
865 /// Encapsulate information regarding vectorization of a loop and its epilogue.
866 /// This information is meant to be updated and used across two stages of
867 /// epilogue vectorization.
868 struct EpilogueLoopVectorizationInfo {
869   ElementCount MainLoopVF = ElementCount::getFixed(0);
870   unsigned MainLoopUF = 0;
871   ElementCount EpilogueVF = ElementCount::getFixed(0);
872   unsigned EpilogueUF = 0;
873   BasicBlock *MainLoopIterationCountCheck = nullptr;
874   BasicBlock *EpilogueIterationCountCheck = nullptr;
875   BasicBlock *SCEVSafetyCheck = nullptr;
876   BasicBlock *MemSafetyCheck = nullptr;
877   Value *TripCount = nullptr;
878   Value *VectorTripCount = nullptr;
879 
880   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
881                                 ElementCount EVF, unsigned EUF)
882       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
883     assert(EUF == 1 &&
884            "A high UF for the epilogue loop is likely not beneficial.");
885   }
886 };
887 
888 /// An extension of the inner loop vectorizer that creates a skeleton for a
889 /// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the VPlan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue.  This is achieved by
893 /// deriving two concrete strategy classes from this base class and invoking
894 /// them in succession from the loop vectorizer planner.
895 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
896 public:
897   InnerLoopAndEpilogueVectorizer(
898       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
899       DominatorTree *DT, const TargetLibraryInfo *TLI,
900       const TargetTransformInfo *TTI, AssumptionCache *AC,
901       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
902       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
903       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
904       GeneratedRTChecks &Checks)
905       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
906                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
907                             Checks),
908         EPI(EPI) {}
909 
910   // Override this function to handle the more complex control flow around the
911   // three loops.
912   BasicBlock *createVectorizedLoopSkeleton() final override {
913     return createEpilogueVectorizedLoopSkeleton();
914   }
915 
916   /// The interface for creating a vectorized skeleton using one of two
917   /// different strategies, each corresponding to one execution of the vplan
918   /// as described above.
919   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
920 
921   /// Holds and updates state information required to vectorize the main loop
922   /// and its epilogue in two separate passes. This setup helps us avoid
923   /// regenerating and recomputing runtime safety checks. It also helps us to
924   /// shorten the iteration-count-check path length for the cases where the
925   /// iteration count of the loop is so small that the main vector loop is
926   /// completely skipped.
927   EpilogueLoopVectorizationInfo &EPI;
928 };
929 
930 /// A specialized derived class of inner loop vectorizer that performs
931 /// vectorization of *main* loops in the process of vectorizing loops and their
932 /// epilogues.
933 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
934 public:
935   EpilogueVectorizerMainLoop(
936       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
937       DominatorTree *DT, const TargetLibraryInfo *TLI,
938       const TargetTransformInfo *TTI, AssumptionCache *AC,
939       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
940       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
941       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
942       GeneratedRTChecks &Check)
943       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
944                                        EPI, LVL, CM, BFI, PSI, Check) {}
945   /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of VPlan execution).
947   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
948 
949 protected:
950   /// Emits an iteration count bypass check once for the main loop (when \p
951   /// ForEpilogue is false) and once for the epilogue loop (when \p
952   /// ForEpilogue is true).
953   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
954                                              bool ForEpilogue);
955   void printDebugTracesAtStart() override;
956   void printDebugTracesAtEnd() override;
957 };
958 
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
962 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
963 public:
964   EpilogueVectorizerEpilogueLoop(
965       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
966       DominatorTree *DT, const TargetLibraryInfo *TLI,
967       const TargetTransformInfo *TTI, AssumptionCache *AC,
968       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
969       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
970       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
971       GeneratedRTChecks &Checks)
972       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
973                                        EPI, LVL, CM, BFI, PSI, Checks) {}
974   /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
976   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
977 
978 protected:
979   /// Emits an iteration count bypass check after the main vector loop has
980   /// finished to see if there are any iterations left to execute by either
981   /// the vector epilogue or the scalar epilogue.
982   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
983                                                       BasicBlock *Bypass,
984                                                       BasicBlock *Insert);
985   void printDebugTracesAtStart() override;
986   void printDebugTracesAtEnd() override;
987 };
988 } // end namespace llvm
989 
/// Look for a meaningful debug location on the instruction or its
/// operands.
992 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
993   if (!I)
994     return I;
995 
996   DebugLoc Empty;
997   if (I->getDebugLoc() != Empty)
998     return I;
999 
1000   for (Use &Op : I->operands()) {
1001     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1002       if (OpInst->getDebugLoc() != Empty)
1003         return OpInst;
1004   }
1005 
1006   return I;
1007 }
1008 
1009 void InnerLoopVectorizer::setDebugLocFromInst(
1010     const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
1011   IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
1012   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
1013     const DILocation *DIL = Inst->getDebugLoc();
1014 
    // When an FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
1017     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1018         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
1019       // FIXME: For scalable vectors, assume vscale=1.
1020       auto NewDIL =
1021           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1022       if (NewDIL)
1023         B->SetCurrentDebugLocation(NewDIL.getValue());
1024       else
1025         LLVM_DEBUG(dbgs()
1026                    << "Failed to create new discriminator: "
1027                    << DIL->getFilename() << " Line: " << DIL->getLine());
1028     } else
1029       B->SetCurrentDebugLocation(DIL);
1030   } else
1031     B->SetCurrentDebugLocation(DebugLoc());
1032 }
1033 
1034 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1035 /// is passed, the message relates to that particular instruction.
1036 #ifndef NDEBUG
1037 static void debugVectorizationMessage(const StringRef Prefix,
1038                                       const StringRef DebugMsg,
1039                                       Instruction *I) {
1040   dbgs() << "LV: " << Prefix << DebugMsg;
1041   if (I != nullptr)
1042     dbgs() << " " << *I;
1043   else
1044     dbgs() << '.';
1045   dbgs() << '\n';
1046 }
1047 #endif
1048 
1049 /// Create an analysis remark that explains why vectorization failed
1050 ///
1051 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1052 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1053 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1054 /// the location of the remark.  \return the remark object that can be
1055 /// streamed to.
1056 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1057     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1058   Value *CodeRegion = TheLoop->getHeader();
1059   DebugLoc DL = TheLoop->getStartLoc();
1060 
1061   if (I) {
1062     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
1065     if (I->getDebugLoc())
1066       DL = I->getDebugLoc();
1067   }
1068 
1069   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1070 }
1071 
1072 /// Return a value for Step multiplied by VF.
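/// For example, for a fixed VF of 4 and a Step of 2 this is the constant 8;
/// for a scalable VF of <vscale x 4> it is 8 multiplied by the runtime vscale.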
1073 static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
1074                               int64_t Step) {
1075   assert(Ty->isIntegerTy() && "Expected an integer step");
1076   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
1077   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1078 }
1079 
1080 namespace llvm {
1081 
1082 /// Return the runtime value for VF.
1083 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1084   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1085   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1086 }
1087 
1088 static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
1089   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1090   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1091   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1092   return B.CreateUIToFP(RuntimeVF, FTy);
1093 }
1094 
1095 void reportVectorizationFailure(const StringRef DebugMsg,
1096                                 const StringRef OREMsg, const StringRef ORETag,
1097                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1098                                 Instruction *I) {
1099   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1100   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1101   ORE->emit(
1102       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1103       << "loop not vectorized: " << OREMsg);
1104 }
1105 
1106 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1107                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1108                              Instruction *I) {
1109   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1110   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1111   ORE->emit(
1112       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1113       << Msg);
1114 }
1115 
1116 } // end namespace llvm
1117 
1118 #ifndef NDEBUG
1119 /// \return string containing a file name and a line # for the given loop.
1120 static std::string getDebugLocString(const Loop *L) {
1121   std::string Result;
1122   if (L) {
1123     raw_string_ostream OS(Result);
1124     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1125       LoopDbgLoc.print(OS);
1126     else
1127       // Just print the module name.
1128       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1129     OS.flush();
1130   }
1131   return Result;
1132 }
1133 #endif
1134 
1135 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1136                                          const Instruction *Orig) {
1137   // If the loop was versioned with memchecks, add the corresponding no-alias
1138   // metadata.
1139   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1140     LVer->annotateInstWithNoAlias(To, Orig);
1141 }
1142 
1143 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1144     VPTransformState &State) {
1146   // Collect recipes in the backward slice of `Root` that may generate a poison
1147   // value that is used after vectorization.
1148   SmallPtrSet<VPRecipeBase *, 16> Visited;
1149   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1150     SmallVector<VPRecipeBase *, 16> Worklist;
1151     Worklist.push_back(Root);
1152 
1153     // Traverse the backward slice of Root through its use-def chain.
1154     while (!Worklist.empty()) {
1155       VPRecipeBase *CurRec = Worklist.back();
1156       Worklist.pop_back();
1157 
1158       if (!Visited.insert(CurRec).second)
1159         continue;
1160 
1161       // Prune search if we find another recipe generating a widen memory
1162       // instruction. Widen memory instructions involved in address computation
1163       // will lead to gather/scatter instructions, which don't need to be
1164       // handled.
1165       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1166           isa<VPInterleaveRecipe>(CurRec))
1167         continue;
1168 
1169       // This recipe contributes to the address computation of a widen
1170       // load/store. Collect recipe if its underlying instruction has
1171       // poison-generating flags.
1172       Instruction *Instr = CurRec->getUnderlyingInstr();
1173       if (Instr && Instr->hasPoisonGeneratingFlags())
1174         State.MayGeneratePoisonRecipes.insert(CurRec);
1175 
1176       // Add new definitions to the worklist.
      for (VPValue *Operand : CurRec->operands())
        if (VPDef *OpDef = Operand->getDef())
1179           Worklist.push_back(cast<VPRecipeBase>(OpDef));
1180     }
1181   });
1182 
  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1186   auto Iter = depth_first(
1187       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1188   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1189     for (VPRecipeBase &Recipe : *VPBB) {
1190       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1191         Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
1192         VPDef *AddrDef = WidenRec->getAddr()->getDef();
1193         if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
1194             Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
1195           collectPoisonGeneratingInstrsInBackwardSlice(
1196               cast<VPRecipeBase>(AddrDef));
1197       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1198         VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1199         if (AddrDef) {
1200           // Check if any member of the interleave group needs predication.
1201           const InterleaveGroup<Instruction> *InterGroup =
1202               InterleaveRec->getInterleaveGroup();
1203           bool NeedPredication = false;
1204           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1205                I < NumMembers; ++I) {
1206             Instruction *Member = InterGroup->getMember(I);
1207             if (Member)
1208               NeedPredication |=
1209                   Legal->blockNeedsPredication(Member->getParent());
1210           }
1211 
1212           if (NeedPredication)
1213             collectPoisonGeneratingInstrsInBackwardSlice(
1214                 cast<VPRecipeBase>(AddrDef));
1215         }
1216       }
1217     }
1218   }
1219 }
1220 
1221 void InnerLoopVectorizer::addMetadata(Instruction *To,
1222                                       Instruction *From) {
1223   propagateMetadata(To, From);
1224   addNewMetadata(To, From);
1225 }
1226 
1227 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1228                                       Instruction *From) {
1229   for (Value *V : To) {
1230     if (Instruction *I = dyn_cast<Instruction>(V))
1231       addMetadata(I, From);
1232   }
1233 }
1234 
1235 namespace llvm {
1236 
// Describes how the scalar epilogue loop should be lowered, as used by the
// loop vectorization cost model.
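//
// As a rough illustration of where these values come from (the exact mapping
// is decided elsewhere in this file): ordinary loops default to
// CM_ScalarEpilogueAllowed, functions compiled for size map to
// CM_ScalarEpilogueNotAllowedOptSize, and a loop annotated with something
// like '#pragma clang loop vectorize_predicate(enable)' is treated as
// CM_ScalarEpilogueNotNeededUsePredicate.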
1239 enum ScalarEpilogueLowering {
1240 
1241   // The default: allowing scalar epilogues.
1242   CM_ScalarEpilogueAllowed,
1243 
1244   // Vectorization with OptForSize: don't allow epilogues.
1245   CM_ScalarEpilogueNotAllowedOptSize,
1246 
  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant and free of runtime
  // guards and scalar iteration overheads.
1251   CM_ScalarEpilogueNotAllowedLowTripLoop,
1252 
1253   // Loop hint predicate indicating an epilogue is undesired.
1254   CM_ScalarEpilogueNotNeededUsePredicate,
1255 
  // Directive indicating we must either tail fold or not vectorize.
1257   CM_ScalarEpilogueNotAllowedUsePredicate
1258 };
1259 
1260 /// ElementCountComparator creates a total ordering for ElementCount
1261 /// for the purposes of using it in a set structure.
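/// For example, under this ordering all fixed-width counts precede all
/// scalable ones: 4 < 8 < vscale x 2 < vscale x 4.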
1262 struct ElementCountComparator {
1263   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1264     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1265            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1266   }
1267 };
1268 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1269 
1270 /// LoopVectorizationCostModel - estimates the expected speedups due to
1271 /// vectorization.
1272 /// In many cases vectorization is not profitable. This can happen because of
1273 /// a number of reasons. In this class we mainly attempt to predict the
1274 /// expected speedup/slowdowns due to the supported instruction set. We use the
1275 /// TargetTransformInfo to query the different backends for the cost of
1276 /// different operations.
1277 class LoopVectorizationCostModel {
1278 public:
1279   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1280                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1281                              LoopVectorizationLegality *Legal,
1282                              const TargetTransformInfo &TTI,
1283                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1284                              AssumptionCache *AC,
1285                              OptimizationRemarkEmitter *ORE, const Function *F,
1286                              const LoopVectorizeHints *Hints,
1287                              InterleavedAccessInfo &IAI)
1288       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1289         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1290         Hints(Hints), InterleaveInfo(IAI) {}
1291 
1292   /// \return An upper bound for the vectorization factors (both fixed and
1293   /// scalable). If the factors are 0, vectorization and interleaving should be
1294   /// avoided up front.
1295   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1296 
1297   /// \return True if runtime checks are required for vectorization, and false
1298   /// otherwise.
1299   bool runtimeChecksRequired();
1300 
  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not zero,
  /// that vectorization factor will be selected if vectorization is possible.
1305   VectorizationFactor
1306   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1307 
1308   VectorizationFactor
1309   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1310                                     const LoopVectorizationPlanner &LVP);
1311 
1312   /// Setup cost-based decisions for user vectorization factor.
1313   /// \return true if the UserVF is a feasible VF to be chosen.
1314   bool selectUserVectorizationFactor(ElementCount UserVF) {
1315     collectUniformsAndScalars(UserVF);
1316     collectInstsToScalarize(UserVF);
1317     return expectedCost(UserVF).first.isValid();
1318   }
1319 
  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64-bit loop indices.
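  /// For example (illustrative): in a loop that loads i8 values and stores i32
  /// results, the returned pair would be (8, 32), while a 64-bit induction
  /// variable used only for addressing is ignored.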
1323   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1324 
1325   /// \return The desired interleave count.
1326   /// If interleave count has been specified by metadata it will be returned.
1327   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1328   /// are the selected vectorization factor and the cost of the selected VF.
1329   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1330 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1338   void setCostBasedWideningDecision(ElementCount VF);
1339 
1340   /// A struct that represents some properties of the register usage
1341   /// of a loop.
1342   struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is the ClassID of the target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1349   };
1350 
  /// \return Information about the register usage of the loop for the given
  /// vectorization factors.
1353   SmallVector<RegisterUsage, 8>
1354   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1355 
1356   /// Collect values we want to ignore in the cost model.
1357   void collectValuesToIgnore();
1358 
1359   /// Collect all element types in the loop for which widening is needed.
1360   void collectElementTypesForWidening();
1361 
  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1364   void collectInLoopReductions();
1365 
1366   /// Returns true if we should use strict in-order reductions for the given
1367   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1368   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1369   /// of FP operations.
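  /// A minimal illustration (assuming no fast-math flags that permit
  /// reassociation):
  ///   float s = 0.f;
  ///   for (int i = 0; i < n; ++i)
  ///     s += a[i];
  /// Here the additions must be performed in order, so the reduction is
  /// vectorized with an ordered (strict) in-loop FP reduction rather than a
  /// reassociating wide fadd.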
1370   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1371     return !Hints->allowReordering() && RdxDesc.isOrdered();
1372   }
1373 
1374   /// \returns The smallest bitwidth each instruction can be represented with.
1375   /// The vector equivalents of these instructions should be truncated to this
1376   /// type.
1377   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1378     return MinBWs;
1379   }
1380 
1381   /// \returns True if it is more profitable to scalarize instruction \p I for
1382   /// vectorization factor \p VF.
1383   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1384     assert(VF.isVector() &&
1385            "Profitable to scalarize relevant only for VF > 1.");
1386 
1387     // Cost model is not run in the VPlan-native path - return conservative
1388     // result until this changes.
1389     if (EnableVPlanNativePath)
1390       return false;
1391 
1392     auto Scalars = InstsToScalarize.find(VF);
1393     assert(Scalars != InstsToScalarize.end() &&
1394            "VF not yet analyzed for scalarization profitability");
1395     return Scalars->second.find(I) != Scalars->second.end();
1396   }
1397 
1398   /// Returns true if \p I is known to be uniform after vectorization.
1399   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1400     if (VF.isScalar())
1401       return true;
1402 
1403     // Cost model is not run in the VPlan-native path - return conservative
1404     // result until this changes.
1405     if (EnableVPlanNativePath)
1406       return false;
1407 
1408     auto UniformsPerVF = Uniforms.find(VF);
1409     assert(UniformsPerVF != Uniforms.end() &&
1410            "VF not yet analyzed for uniformity");
1411     return UniformsPerVF->second.count(I);
1412   }
1413 
1414   /// Returns true if \p I is known to be scalar after vectorization.
1415   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1416     if (VF.isScalar())
1417       return true;
1418 
1419     // Cost model is not run in the VPlan-native path - return conservative
1420     // result until this changes.
1421     if (EnableVPlanNativePath)
1422       return false;
1423 
1424     auto ScalarsPerVF = Scalars.find(VF);
1425     assert(ScalarsPerVF != Scalars.end() &&
1426            "Scalar values are not calculated for VF");
1427     return ScalarsPerVF->second.count(I);
1428   }
1429 
1430   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1431   /// for vectorization factor \p VF.
1432   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1433     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1434            !isProfitableToScalarize(I, VF) &&
1435            !isScalarAfterVectorization(I, VF);
1436   }
1437 
1438   /// Decision that was taken during cost calculation for memory instruction.
1439   enum InstWidening {
1440     CM_Unknown,
1441     CM_Widen,         // For consecutive accesses with stride +1.
1442     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1443     CM_Interleave,
1444     CM_GatherScatter,
1445     CM_Scalarize
1446   };
1447 
1448   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1449   /// instruction \p I and vector width \p VF.
1450   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1451                            InstructionCost Cost) {
1452     assert(VF.isVector() && "Expected VF >=2");
1453     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1454   }
1455 
1456   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1457   /// interleaving group \p Grp and vector width \p VF.
1458   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1459                            ElementCount VF, InstWidening W,
1460                            InstructionCost Cost) {
1461     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1464     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1465       if (auto *I = Grp->getMember(i)) {
1466         if (Grp->getInsertPos() == I)
1467           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1468         else
1469           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1470       }
1471     }
1472   }
1473 
1474   /// Return the cost model decision for the given instruction \p I and vector
1475   /// width \p VF. Return CM_Unknown if this instruction did not pass
1476   /// through the cost modeling.
1477   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1478     assert(VF.isVector() && "Expected VF to be a vector VF");
1479     // Cost model is not run in the VPlan-native path - return conservative
1480     // result until this changes.
1481     if (EnableVPlanNativePath)
1482       return CM_GatherScatter;
1483 
1484     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1485     auto Itr = WideningDecisions.find(InstOnVF);
1486     if (Itr == WideningDecisions.end())
1487       return CM_Unknown;
1488     return Itr->second.first;
1489   }
1490 
1491   /// Return the vectorization cost for the given instruction \p I and vector
1492   /// width \p VF.
1493   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1494     assert(VF.isVector() && "Expected VF >=2");
1495     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1496     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1497            "The cost is not calculated");
1498     return WideningDecisions[InstOnVF].second;
1499   }
1500 
1501   /// Return True if instruction \p I is an optimizable truncate whose operand
1502   /// is an induction variable. Such a truncate will be removed by adding a new
1503   /// induction variable with the destination type.
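  /// A small illustration (hypothetical IR):
  ///   %iv    = phi i64 [ 0, %preheader ], [ %iv.next, %loop ]
  ///   %trunc = trunc i64 %iv to i32
  /// Rather than widening the truncate, the vectorizer can introduce a new i32
  /// induction variable with the truncated start value and step.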
1504   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1505     // If the instruction is not a truncate, return false.
1506     auto *Trunc = dyn_cast<TruncInst>(I);
1507     if (!Trunc)
1508       return false;
1509 
1510     // Get the source and destination types of the truncate.
1511     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1512     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1513 
1514     // If the truncate is free for the given types, return false. Replacing a
1515     // free truncate with an induction variable would add an induction variable
1516     // update instruction to each iteration of the loop. We exclude from this
1517     // check the primary induction variable since it will need an update
1518     // instruction regardless.
1519     Value *Op = Trunc->getOperand(0);
1520     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1521       return false;
1522 
1523     // If the truncated value is not an induction variable, return false.
1524     return Legal->isInductionPhi(Op);
1525   }
1526 
1527   /// Collects the instructions to scalarize for each predicated instruction in
1528   /// the loop.
1529   void collectInstsToScalarize(ElementCount VF);
1530 
1531   /// Collect Uniform and Scalar values for the given \p VF.
1532   /// The sets depend on CM decision for Load/Store instructions
1533   /// that may be vectorized as interleave, gather-scatter or scalarized.
1534   void collectUniformsAndScalars(ElementCount VF) {
1535     // Do the analysis once.
1536     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1537       return;
1538     setCostBasedWideningDecision(VF);
1539     collectLoopUniforms(VF);
1540     collectLoopScalars(VF);
1541   }
1542 
1543   /// Returns true if the target machine supports masked store operation
1544   /// for the given \p DataType and kind of access to \p Ptr.
1545   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1546     return Legal->isConsecutivePtr(DataType, Ptr) &&
1547            TTI.isLegalMaskedStore(DataType, Alignment);
1548   }
1549 
1550   /// Returns true if the target machine supports masked load operation
1551   /// for the given \p DataType and kind of access to \p Ptr.
1552   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1553     return Legal->isConsecutivePtr(DataType, Ptr) &&
1554            TTI.isLegalMaskedLoad(DataType, Alignment);
1555   }
1556 
1557   /// Returns true if the target machine can represent \p V as a masked gather
1558   /// or scatter operation.
1559   bool isLegalGatherOrScatter(Value *V) {
1560     bool LI = isa<LoadInst>(V);
1561     bool SI = isa<StoreInst>(V);
1562     if (!LI && !SI)
1563       return false;
1564     auto *Ty = getLoadStoreType(V);
1565     Align Align = getLoadStoreAlignment(V);
1566     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1567            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1568   }
1569 
1570   /// Returns true if the target machine supports all of the reduction
1571   /// variables found for the given VF.
1572   bool canVectorizeReductions(ElementCount VF) const {
1573     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1574       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1575       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1576     }));
1577   }
1578 
1579   /// Returns true if \p I is an instruction that will be scalarized with
1580   /// predication. Such instructions include conditional stores and
1581   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
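  /// For example (illustrative):
  ///   if (c[i])
  ///     a[i] = x / b[i];
  /// The division may trap if executed for a masked-off lane, and the
  /// conditional store must not happen when the condition is false, so both
  /// may have to be scalarized and predicated.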
1584   bool isScalarWithPredication(Instruction *I) const;
1585 
1586   // Returns true if \p I is an instruction that will be predicated either
1587   // through scalar predication or masked load/store or masked gather/scatter.
1588   // Superset of instructions that return true for isScalarWithPredication.
1589   bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
1590     // When we know the load is uniform and the original scalar loop was not
1591     // predicated we don't need to mark it as a predicated instruction. Any
1592     // vectorised blocks created when tail-folding are something artificial we
1593     // have introduced and we know there is always at least one active lane.
1594     // That's why we call Legal->blockNeedsPredication here because it doesn't
1595     // query tail-folding.
1596     if (IsKnownUniform && isa<LoadInst>(I) &&
1597         !Legal->blockNeedsPredication(I->getParent()))
1598       return false;
1599     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1600       return false;
1601     // Loads and stores that need some form of masked operation are predicated
1602     // instructions.
1603     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1604       return Legal->isMaskRequired(I);
1605     return isScalarWithPredication(I);
1606   }
1607 
1608   /// Returns true if \p I is a memory instruction with consecutive memory
1609   /// access that can be widened.
1610   bool
1611   memoryInstructionCanBeWidened(Instruction *I,
1612                                 ElementCount VF = ElementCount::getFixed(1));
1613 
1614   /// Returns true if \p I is a memory instruction in an interleaved-group
1615   /// of memory accesses that can be vectorized with wide vector loads/stores
1616   /// and shuffles.
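  /// For example (illustrative): the two loads below form an interleave group
  /// with factor 2 and can be widened into a single wide load plus shuffles:
  ///   for (i = 0; i < n; ++i) {
  ///     x = a[2 * i];
  ///     y = a[2 * i + 1];
  ///   }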
1617   bool
1618   interleavedAccessCanBeWidened(Instruction *I,
1619                                 ElementCount VF = ElementCount::getFixed(1));
1620 
1621   /// Check if \p Instr belongs to any interleaved access group.
1622   bool isAccessInterleaved(Instruction *Instr) {
1623     return InterleaveInfo.isInterleaved(Instr);
1624   }
1625 
1626   /// Get the interleaved access group that \p Instr belongs to.
1627   const InterleaveGroup<Instruction> *
1628   getInterleavedAccessGroup(Instruction *Instr) {
1629     return InterleaveInfo.getInterleaveGroup(Instr);
1630   }
1631 
1632   /// Returns true if we're required to use a scalar epilogue for at least
1633   /// the final iteration of the original loop.
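  /// For example (illustrative), an interleave group with a gap, such as one
  /// accessing a[3*i] and a[3*i+1] but not a[3*i+2], must not speculatively
  /// load past the end of the underlying array in its last wide access, so at
  /// least the final iteration is peeled into a scalar epilogue.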
1634   bool requiresScalarEpilogue(ElementCount VF) const {
1635     if (!isScalarEpilogueAllowed())
1636       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1639     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1640       return true;
1641     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1642   }
1643 
1644   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1645   /// loop hint annotation.
1646   bool isScalarEpilogueAllowed() const {
1647     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1648   }
1649 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1651   bool foldTailByMasking() const { return FoldTailByMasking; }
1652 
  /// Returns true if the instructions in block \p BB require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block was predicated in the original loop.
1656   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1657     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1658   }
1659 
1660   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1661   /// nodes to the chain of instructions representing the reductions. Uses a
1662   /// MapVector to ensure deterministic iteration order.
1663   using ReductionChainMap =
1664       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1665 
1666   /// Return the chain of instructions representing an inloop reduction.
1667   const ReductionChainMap &getInLoopReductionChains() const {
1668     return InLoopReductionChains;
1669   }
1670 
1671   /// Returns true if the Phi is part of an inloop reduction.
1672   bool isInLoopReduction(PHINode *Phi) const {
1673     return InLoopReductionChains.count(Phi);
1674   }
1675 
1676   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1677   /// with factor VF.  Return the cost of the instruction, including
1678   /// scalarization overhead if it's needed.
1679   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1680 
1681   /// Estimate cost of a call instruction CI if it were vectorized with factor
1682   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1686   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1687                                     bool &NeedToScalarize) const;
1688 
1689   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1690   /// that of B.
1691   bool isMoreProfitable(const VectorizationFactor &A,
1692                         const VectorizationFactor &B) const;
1693 
1694   /// Invalidates decisions already taken by the cost model.
1695   void invalidateCostModelingDecisions() {
1696     WideningDecisions.clear();
1697     Uniforms.clear();
1698     Scalars.clear();
1699   }
1700 
1701 private:
1702   unsigned NumPredStores = 0;
1703 
1704   /// \return An upper bound for the vectorization factors for both
1705   /// fixed and scalable vectorization, where the minimum-known number of
1706   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1707   /// disabled or unsupported, then the scalable part will be equal to
1708   /// ElementCount::getScalable(0).
1709   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1710                                            ElementCount UserVF,
1711                                            bool FoldTailByMasking);
1712 
  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip count, but limited to a maximum safe VF.
1715   /// This is a helper function of computeFeasibleMaxVF.
1716   /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1717   /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
1719   /// D98509). The issue is currently under investigation and this workaround
1720   /// will be removed as soon as possible.
1721   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1722                                        unsigned SmallestType,
1723                                        unsigned WidestType,
1724                                        const ElementCount &MaxSafeVF,
1725                                        bool FoldTailByMasking);
1726 
1727   /// \return the maximum legal scalable VF, based on the safe max number
1728   /// of elements.
1729   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1730 
1731   /// The vectorization cost is a combination of the cost itself and a boolean
1732   /// indicating whether any of the contributing operations will actually
1733   /// operate on vector values after type legalization in the backend. If this
1734   /// latter value is false, then all operations will be scalarized (i.e. no
1735   /// vectorization has actually taken place).
1736   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1737 
1738   /// Returns the expected execution cost. The unit of the cost does
1739   /// not matter because we use the 'cost' units to compare different
1740   /// vector widths. The cost that is returned is *not* normalized by
1741   /// the factor width. If \p Invalid is not nullptr, this function
1742   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1743   /// each instruction that has an Invalid cost for the given VF.
1744   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1745   VectorizationCostTy
1746   expectedCost(ElementCount VF,
1747                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1748 
1749   /// Returns the execution time cost of an instruction for a given vector
1750   /// width. Vector width of one means scalar.
1751   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1752 
1753   /// The cost-computation logic from getInstructionCost which provides
1754   /// the vector type as an output parameter.
1755   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1756                                      Type *&VectorTy);
1757 
1758   /// Return the cost of instructions in an inloop reduction pattern, if I is
1759   /// part of that pattern.
1760   Optional<InstructionCost>
1761   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1762                           TTI::TargetCostKind CostKind);
1763 
1764   /// Calculate vectorization cost of memory instruction \p I.
1765   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1766 
1767   /// The cost computation for scalarized memory instruction.
1768   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1769 
1770   /// The cost computation for interleaving group of memory instructions.
1771   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1772 
1773   /// The cost computation for Gather/Scatter instruction.
1774   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1775 
1776   /// The cost computation for widening instruction \p I with consecutive
1777   /// memory access.
1778   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1779 
1780   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1781   /// Load: scalar load + broadcast.
1782   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1783   /// element)
1784   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1785 
1786   /// Estimate the overhead of scalarizing an instruction. This is a
1787   /// convenience wrapper for the type-based getScalarizationOverhead API.
1788   InstructionCost getScalarizationOverhead(Instruction *I,
1789                                            ElementCount VF) const;
1790 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1793   bool isConsecutiveLoadOrStore(Instruction *I);
1794 
1795   /// Returns true if an artificially high cost for emulated masked memrefs
1796   /// should be used.
1797   bool useEmulatedMaskMemRefHack(Instruction *I);
1798 
1799   /// Map of scalar integer values to the smallest bitwidth they can be legally
1800   /// represented as. The vector equivalents of these values should be truncated
1801   /// to this type.
1802   MapVector<Instruction *, uint64_t> MinBWs;
1803 
1804   /// A type representing the costs for instructions if they were to be
1805   /// scalarized rather than vectorized. The entries are Instruction-Cost
1806   /// pairs.
1807   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1808 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1811   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1812 
1813   /// Records whether it is allowed to have the original scalar loop execute at
1814   /// least once. This may be needed as a fallback loop in case runtime
1815   /// aliasing/dependence checks fail, or to handle the tail/remainder
1816   /// iterations when the trip count is unknown or doesn't divide by the VF,
1817   /// or as a peel-loop to handle gaps in interleave-groups.
1818   /// Under optsize and when the trip count is very small we don't allow any
1819   /// iterations to execute in the scalar loop.
1820   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1821 
1822   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1823   bool FoldTailByMasking = false;
1824 
1825   /// A map holding scalar costs for different vectorization factors. The
1826   /// presence of a cost for an instruction in the mapping indicates that the
1827   /// instruction will be scalarized when vectorizing with the associated
1828   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1829   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1830 
1831   /// Holds the instructions known to be uniform after vectorization.
1832   /// The data is collected per VF.
1833   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1834 
1835   /// Holds the instructions known to be scalar after vectorization.
1836   /// The data is collected per VF.
1837   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1838 
1839   /// Holds the instructions (address computations) that are forced to be
1840   /// scalarized.
1841   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1842 
1843   /// PHINodes of the reductions that should be expanded in-loop along with
1844   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1846   ReductionChainMap InLoopReductionChains;
1847 
1848   /// A Map of inloop reduction operations and their immediate chain operand.
1849   /// FIXME: This can be removed once reductions can be costed correctly in
1850   /// vplan. This was added to allow quick lookup to the inloop operations,
1851   /// without having to loop through InLoopReductionChains.
1852   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1853 
1854   /// Returns the expected difference in cost from scalarizing the expression
1855   /// feeding a predicated instruction \p PredInst. The instructions to
1856   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1857   /// non-negative return value implies the expression will be scalarized.
1858   /// Currently, only single-use chains are considered for scalarization.
1859   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1860                               ElementCount VF);
1861 
1862   /// Collect the instructions that are uniform after vectorization. An
1863   /// instruction is uniform if we represent it with a single scalar value in
1864   /// the vectorized loop corresponding to each vector iteration. Examples of
1865   /// uniform instructions include pointer operands of consecutive or
1866   /// interleaved memory accesses. Note that although uniformity implies an
1867   /// instruction will be scalar, the reverse is not true. In general, a
1868   /// scalarized instruction will be represented by VF scalar values in the
1869   /// vectorized loop, each corresponding to an iteration of the original
1870   /// scalar loop.
1871   void collectLoopUniforms(ElementCount VF);
1872 
1873   /// Collect the instructions that are scalar after vectorization. An
1874   /// instruction is scalar if it is known to be uniform or will be scalarized
1875   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1876   /// to the list if they are used by a load/store instruction that is marked as
1877   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1878   /// VF values in the vectorized loop, each corresponding to an iteration of
1879   /// the original scalar loop.
1880   void collectLoopScalars(ElementCount VF);
1881 
1882   /// Keeps cost model vectorization decision and cost for instructions.
1883   /// Right now it is used for memory instructions only.
1884   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1885                                 std::pair<InstWidening, InstructionCost>>;
1886 
1887   DecisionList WideningDecisions;
1888 
1889   /// Returns true if \p V is expected to be vectorized and it needs to be
1890   /// extracted.
1891   bool needsExtract(Value *V, ElementCount VF) const {
1892     Instruction *I = dyn_cast<Instruction>(V);
1893     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1894         TheLoop->isLoopInvariant(I))
1895       return false;
1896 
1897     // Assume we can vectorize V (and hence we need extraction) if the
1898     // scalars are not computed yet. This can happen, because it is called
1899     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1900     // the scalars are collected. That should be a safe assumption in most
1901     // cases, because we check if the operands have vectorizable types
1902     // beforehand in LoopVectorizationLegality.
1903     return Scalars.find(VF) == Scalars.end() ||
1904            !isScalarAfterVectorization(I, VF);
1905   };
1906 
1907   /// Returns a range containing only operands needing to be extracted.
1908   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1909                                                    ElementCount VF) const {
1910     return SmallVector<Value *, 4>(make_filter_range(
1911         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1912   }
1913 
1914   /// Determines if we have the infrastructure to vectorize loop \p L and its
1915   /// epilogue, assuming the main loop is vectorized by \p VF.
1916   bool isCandidateForEpilogueVectorization(const Loop &L,
1917                                            const ElementCount VF) const;
1918 
1919   /// Returns true if epilogue vectorization is considered profitable, and
1920   /// false otherwise.
1921   /// \p VF is the vectorization factor chosen for the original loop.
1922   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1923 
1924 public:
1925   /// The loop that we evaluate.
1926   Loop *TheLoop;
1927 
1928   /// Predicated scalar evolution analysis.
1929   PredicatedScalarEvolution &PSE;
1930 
1931   /// Loop Info analysis.
1932   LoopInfo *LI;
1933 
1934   /// Vectorization legality.
1935   LoopVectorizationLegality *Legal;
1936 
1937   /// Vector target information.
1938   const TargetTransformInfo &TTI;
1939 
1940   /// Target Library Info.
1941   const TargetLibraryInfo *TLI;
1942 
1943   /// Demanded bits analysis.
1944   DemandedBits *DB;
1945 
1946   /// Assumption cache.
1947   AssumptionCache *AC;
1948 
1949   /// Interface to emit optimization remarks.
1950   OptimizationRemarkEmitter *ORE;
1951 
1952   const Function *TheFunction;
1953 
1954   /// Loop Vectorize Hint.
1955   const LoopVectorizeHints *Hints;
1956 
1957   /// The interleave access information contains groups of interleaved accesses
1958   /// with the same stride and close to each other.
1959   InterleavedAccessInfo &InterleaveInfo;
1960 
1961   /// Values to ignore in the cost model.
1962   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1963 
1964   /// Values to ignore in the cost model when VF > 1.
1965   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1966 
1967   /// All element types found in the loop.
1968   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1969 
1970   /// Profitable vector factors.
1971   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1972 };
1973 } // end namespace llvm
1974 
1975 /// Helper struct to manage generating runtime checks for vectorization.
1976 ///
1977 /// The runtime checks are created up-front in temporary blocks to allow better
1978 /// estimating the cost and un-linked from the existing IR. After deciding to
1979 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1980 /// temporary blocks are completely removed.
1981 class GeneratedRTChecks {
1982   /// Basic block which contains the generated SCEV checks, if any.
1983   BasicBlock *SCEVCheckBlock = nullptr;
1984 
1985   /// The value representing the result of the generated SCEV checks. If it is
1986   /// nullptr, either no SCEV checks have been generated or they have been used.
1987   Value *SCEVCheckCond = nullptr;
1988 
1989   /// Basic block which contains the generated memory runtime checks, if any.
1990   BasicBlock *MemCheckBlock = nullptr;
1991 
1992   /// The value representing the result of the generated memory runtime checks.
1993   /// If it is nullptr, either no memory runtime checks have been generated or
1994   /// they have been used.
1995   Value *MemRuntimeCheckCond = nullptr;
1996 
1997   DominatorTree *DT;
1998   LoopInfo *LI;
1999 
2000   SCEVExpander SCEVExp;
2001   SCEVExpander MemCheckExp;
2002 
2003 public:
2004   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
2005                     const DataLayout &DL)
2006       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
2007         MemCheckExp(SE, DL, "scev.check") {}
2008 
2009   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
2010   /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
2012   /// there is no vector code generation, the check blocks are removed
2013   /// completely.
2014   void Create(Loop *L, const LoopAccessInfo &LAI,
2015               const SCEVUnionPredicate &UnionPred) {
2016 
2017     BasicBlock *LoopHeader = L->getHeader();
2018     BasicBlock *Preheader = L->getLoopPreheader();
2019 
2020     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
2021     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
2022     // may be used by SCEVExpander. The blocks will be un-linked from their
2023     // predecessors and removed from LI & DT at the end of the function.
2024     if (!UnionPred.isAlwaysTrue()) {
2025       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
2026                                   nullptr, "vector.scevcheck");
2027 
2028       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
2029           &UnionPred, SCEVCheckBlock->getTerminator());
2030     }
2031 
2032     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2033     if (RtPtrChecking.Need) {
2034       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2035       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2036                                  "vector.memcheck");
2037 
2038       MemRuntimeCheckCond =
2039           addRuntimeChecks(MemCheckBlock->getTerminator(), L,
2040                            RtPtrChecking.getChecks(), MemCheckExp);
2041       assert(MemRuntimeCheckCond &&
2042              "no RT checks generated although RtPtrChecking "
2043              "claimed checks are required");
2044     }
2045 
2046     if (!MemCheckBlock && !SCEVCheckBlock)
2047       return;
2048 
2049     // Unhook the temporary block with the checks, update various places
2050     // accordingly.
2051     if (SCEVCheckBlock)
2052       SCEVCheckBlock->replaceAllUsesWith(Preheader);
2053     if (MemCheckBlock)
2054       MemCheckBlock->replaceAllUsesWith(Preheader);
2055 
2056     if (SCEVCheckBlock) {
2057       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2058       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2059       Preheader->getTerminator()->eraseFromParent();
2060     }
2061     if (MemCheckBlock) {
2062       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2063       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2064       Preheader->getTerminator()->eraseFromParent();
2065     }
2066 
2067     DT->changeImmediateDominator(LoopHeader, Preheader);
2068     if (MemCheckBlock) {
2069       DT->eraseNode(MemCheckBlock);
2070       LI->removeBlock(MemCheckBlock);
2071     }
2072     if (SCEVCheckBlock) {
2073       DT->eraseNode(SCEVCheckBlock);
2074       LI->removeBlock(SCEVCheckBlock);
2075     }
2076   }
2077 
2078   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2079   /// unused.
2080   ~GeneratedRTChecks() {
2081     SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
2082     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
2083     if (!SCEVCheckCond)
2084       SCEVCleaner.markResultUsed();
2085 
2086     if (!MemRuntimeCheckCond)
2087       MemCheckCleaner.markResultUsed();
2088 
2089     if (MemRuntimeCheckCond) {
2090       auto &SE = *MemCheckExp.getSE();
2091       // Memory runtime check generation creates compares that use expanded
2092       // values. Remove them before running the SCEVExpanderCleaners.
2093       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2094         if (MemCheckExp.isInsertedInstruction(&I))
2095           continue;
2096         SE.forgetValue(&I);
2097         I.eraseFromParent();
2098       }
2099     }
2100     MemCheckCleaner.cleanup();
2101     SCEVCleaner.cleanup();
2102 
2103     if (SCEVCheckCond)
2104       SCEVCheckBlock->eraseFromParent();
2105     if (MemRuntimeCheckCond)
2106       MemCheckBlock->eraseFromParent();
2107   }
2108 
2109   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2110   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2111   /// depending on the generated condition.
2112   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2113                              BasicBlock *LoopVectorPreHeader,
2114                              BasicBlock *LoopExitBlock) {
2115     if (!SCEVCheckCond)
2116       return nullptr;
2117     if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2118       if (C->isZero())
2119         return nullptr;
2120 
2121     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2122 
2123     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2124     // Create new preheader for vector loop.
2125     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2126       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2127 
2128     SCEVCheckBlock->getTerminator()->eraseFromParent();
2129     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2130     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2131                                                 SCEVCheckBlock);
2132 
2133     DT->addNewBlock(SCEVCheckBlock, Pred);
2134     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2135 
2136     ReplaceInstWithInst(
2137         SCEVCheckBlock->getTerminator(),
2138         BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2139     // Mark the check as used, to prevent it from being removed during cleanup.
2140     SCEVCheckCond = nullptr;
2141     return SCEVCheckBlock;
2142   }
2143 
2144   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2145   /// the branches to branch to the vector preheader or \p Bypass, depending on
2146   /// the generated condition.
2147   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2148                                    BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks at runtime whether arrays overlap.
2150     if (!MemRuntimeCheckCond)
2151       return nullptr;
2152 
2153     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2154     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2155                                                 MemCheckBlock);
2156 
2157     DT->addNewBlock(MemCheckBlock, Pred);
2158     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2159     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2160 
2161     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2162       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2163 
2164     ReplaceInstWithInst(
2165         MemCheckBlock->getTerminator(),
2166         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2167     MemCheckBlock->getTerminator()->setDebugLoc(
2168         Pred->getTerminator()->getDebugLoc());
2169 
2170     // Mark the check as used, to prevent it from being removed during cleanup.
2171     MemRuntimeCheckCond = nullptr;
2172     return MemCheckBlock;
2173   }
2174 };
2175 
2176 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2177 // vectorization. The loop needs to be annotated with #pragma omp simd
2178 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2179 // vector length information is not provided, vectorization is not considered
2180 // explicit. Interleave hints are not allowed either. These limitations will be
2181 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
2183 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2184 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2185 // provides *explicit vectorization hints* (LV can bypass legal checks and
2186 // assume that vectorization is legal). However, both hints are implemented
2187 // using the same metadata (llvm.loop.vectorize, processed by
2188 // LoopVectorizeHints). This will be fixed in the future when the native IR
2189 // representation for pragma 'omp simd' is introduced.
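//
// For example (illustrative source-level annotation; the exact pragma spelling
// is handled by the frontend):
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (i = 0; i < M; ++i)     // explicit vectorization of this outer loop
//     for (j = 0; j < N; ++j)
//       A[i][j] += B[i][j];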
2190 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2191                                    OptimizationRemarkEmitter *ORE) {
2192   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2193   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2194 
2195   // Only outer loops with an explicit vectorization hint are supported.
2196   // Unannotated outer loops are ignored.
2197   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2198     return false;
2199 
2200   Function *Fn = OuterLp->getHeader()->getParent();
2201   if (!Hints.allowVectorization(Fn, OuterLp,
2202                                 true /*VectorizeOnlyWhenForced*/)) {
2203     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2204     return false;
2205   }
2206 
2207   if (Hints.getInterleave() > 1) {
2208     // TODO: Interleave support is future work.
2209     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2210                          "outer loops.\n");
2211     Hints.emitRemarkWithHints();
2212     return false;
2213   }
2214 
2215   return true;
2216 }
2217 
2218 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2219                                   OptimizationRemarkEmitter *ORE,
2220                                   SmallVectorImpl<Loop *> &V) {
2221   // Collect inner loops and outer loops without irreducible control flow. For
2222   // now, only collect outer loops that have explicit vectorization hints. If we
2223   // are stress testing the VPlan H-CFG construction, we collect the outermost
2224   // loop of every loop nest.
2225   if (L.isInnermost() || VPlanBuildStressTest ||
2226       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2227     LoopBlocksRPO RPOT(&L);
2228     RPOT.perform(LI);
2229     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2230       V.push_back(&L);
2231       // TODO: Collect inner loops inside marked outer loops in case
2232       // vectorization fails for the outer loop. Do not invoke
2233       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2234       // already known to be reducible. We can use an inherited attribute for
2235       // that.
2236       return;
2237     }
2238   }
2239   for (Loop *InnerL : L)
2240     collectSupportedLoops(*InnerL, LI, ORE, V);
2241 }
2242 
2243 namespace {
2244 
2245 /// The LoopVectorize Pass.
2246 struct LoopVectorize : public FunctionPass {
2247   /// Pass identification, replacement for typeid
2248   static char ID;
2249 
2250   LoopVectorizePass Impl;
2251 
2252   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2253                          bool VectorizeOnlyWhenForced = false)
2254       : FunctionPass(ID),
2255         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2256     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2257   }
2258 
2259   bool runOnFunction(Function &F) override {
2260     if (skipFunction(F))
2261       return false;
2262 
2263     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2264     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2265     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2266     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2267     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2268     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2269     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2270     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2271     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2272     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2273     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2274     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2275     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2276 
2277     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2278         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2279 
2280     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2281                         GetLAA, *ORE, PSI).MadeAnyChange;
2282   }
2283 
2284   void getAnalysisUsage(AnalysisUsage &AU) const override {
2285     AU.addRequired<AssumptionCacheTracker>();
2286     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2287     AU.addRequired<DominatorTreeWrapperPass>();
2288     AU.addRequired<LoopInfoWrapperPass>();
2289     AU.addRequired<ScalarEvolutionWrapperPass>();
2290     AU.addRequired<TargetTransformInfoWrapperPass>();
2291     AU.addRequired<AAResultsWrapperPass>();
2292     AU.addRequired<LoopAccessLegacyAnalysis>();
2293     AU.addRequired<DemandedBitsWrapperPass>();
2294     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2295     AU.addRequired<InjectTLIMappingsLegacy>();
2296 
2297     // We currently do not preserve loopinfo/dominator analyses with outer loop
2298     // vectorization. Until this is addressed, mark these analyses as preserved
2299     // only for non-VPlan-native path.
2300     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2301     if (!EnableVPlanNativePath) {
2302       AU.addPreserved<LoopInfoWrapperPass>();
2303       AU.addPreserved<DominatorTreeWrapperPass>();
2304     }
2305 
2306     AU.addPreserved<BasicAAWrapperPass>();
2307     AU.addPreserved<GlobalsAAWrapperPass>();
2308     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2309   }
2310 };
2311 
2312 } // end anonymous namespace
2313 
2314 //===----------------------------------------------------------------------===//
2315 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2316 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2317 //===----------------------------------------------------------------------===//
2318 
2319 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2320   // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
2323   Instruction *Instr = dyn_cast<Instruction>(V);
2324   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2325                      (!Instr ||
2326                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2327   // Place the code for broadcasting invariant variables in the new preheader.
2328   IRBuilder<>::InsertPointGuard Guard(Builder);
2329   if (SafeToHoist)
2330     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2331 
2332   // Broadcast the scalar into all locations in the vector.
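  // For a fixed VF this typically expands to an insertelement into lane 0
  // followed by a zero-mask shufflevector, e.g. (illustrative IR, VF = 4):
  //   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %v, i64 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> poison, <4 x i32> zeroinitializer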
2333   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2334 
2335   return Shuf;
2336 }
2337 
2338 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2339     const InductionDescriptor &II, Value *Step, Value *Start,
2340     Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
2341   IRBuilder<> &Builder = State.Builder;
2342   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2343          "Expected either an induction phi-node or a truncate of it!");
2344 
2345   // Construct the initial value of the vector IV in the vector loop preheader
2346   auto CurrIP = Builder.saveIP();
2347   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2348   if (isa<TruncInst>(EntryVal)) {
2349     assert(Start->getType()->isIntegerTy() &&
2350            "Truncation requires an integer type");
2351     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2352     Step = Builder.CreateTrunc(Step, TruncType);
2353     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2354   }
2355 
2356   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
2357   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
2358   Value *SteppedStart =
2359       getStepVector(SplatStart, Zero, Step, II.getInductionOpcode());
2360 
2361   // We create vector phi nodes for both integer and floating-point induction
2362   // variables. Here, we determine the kind of arithmetic we will perform.
2363   Instruction::BinaryOps AddOp;
2364   Instruction::BinaryOps MulOp;
2365   if (Step->getType()->isIntegerTy()) {
2366     AddOp = Instruction::Add;
2367     MulOp = Instruction::Mul;
2368   } else {
2369     AddOp = II.getInductionOpcode();
2370     MulOp = Instruction::FMul;
2371   }
2372 
2373   // Multiply the vectorization factor by the step using integer or
2374   // floating-point arithmetic as appropriate.
2375   Type *StepType = Step->getType();
2376   Value *RuntimeVF;
2377   if (Step->getType()->isFloatingPointTy())
2378     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
2379   else
2380     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
2381   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2382 
2383   // Create a vector splat to use in the induction update.
2384   //
2385   // FIXME: If the step is non-constant, we create the vector splat with
2386   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2387   //        handle a constant vector splat.
2388   Value *SplatVF = isa<Constant>(Mul)
2389                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
2390                        : Builder.CreateVectorSplat(State.VF, Mul);
2391   Builder.restoreIP(CurrIP);
2392 
2393   // We may need to add the step a number of times, depending on the unroll
2394   // factor. The last of those goes into the PHI.
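       // For example (illustrative), with UF = 2 the loop below produces:
       //   part 0: %vec.ind
       //   part 1: %step.add     = %vec.ind  + splat(VF * Step)
       //   latch:  %vec.ind.next = %step.add + splat(VF * Step), which feeds the
       //           back edge of the PHI.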
2395   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2396                                     &*LoopVectorBody->getFirstInsertionPt());
2397   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2398   Instruction *LastInduction = VecInd;
2399   for (unsigned Part = 0; Part < UF; ++Part) {
2400     State.set(Def, LastInduction, Part);
2401 
2402     if (isa<TruncInst>(EntryVal))
2403       addMetadata(LastInduction, EntryVal);
2404 
2405     LastInduction = cast<Instruction>(
2406         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2407     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2408   }
2409 
2410   // Move the last step to the end of the latch block. This ensures consistent
2411   // placement of all induction updates.
2412   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2413   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2414   auto *ICmp = cast<Instruction>(Br->getCondition());
2415   LastInduction->moveBefore(ICmp);
2416   LastInduction->setName("vec.ind.next");
2417 
2418   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2419   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2420 }
2421 
2422 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2423   return Cost->isScalarAfterVectorization(I, VF) ||
2424          Cost->isProfitableToScalarize(I, VF);
2425 }
2426 
2427 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2428   if (shouldScalarizeInstruction(IV))
2429     return true;
2430   auto isScalarInst = [&](User *U) -> bool {
2431     auto *I = cast<Instruction>(U);
2432     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2433   };
2434   return llvm::any_of(IV->users(), isScalarInst);
2435 }
2436 
2437 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
2438                                                 const InductionDescriptor &ID,
2439                                                 Value *Start, TruncInst *Trunc,
2440                                                 VPValue *Def,
2441                                                 VPTransformState &State) {
2442   IRBuilder<> &Builder = State.Builder;
2443   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2444          "Primary induction variable must have an integer type");
2445   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2446 
2447   // The value from the original loop to which we are mapping the new induction
2448   // variable.
2449   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2450 
2451   auto &DL = EntryVal->getModule()->getDataLayout();
2452 
2453   // Generate code for the induction step. Note that induction steps are
2454   // required to be loop-invariant
2455   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2456     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2457            "Induction step should be loop invariant");
2458     if (PSE.getSE()->isSCEVable(IV->getType())) {
2459       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2460       return Exp.expandCodeFor(Step, Step->getType(),
2461                                State.CFG.VectorPreHeader->getTerminator());
2462     }
2463     return cast<SCEVUnknown>(Step)->getValue();
2464   };
2465 
2466   // The scalar value to broadcast. This is derived from the canonical
2467   // induction variable. If a truncation type is given, truncate the canonical
2468   // induction variable and step. Otherwise, derive these values from the
2469   // induction descriptor.
2470   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2471     Value *ScalarIV = Induction;
2472     if (IV != OldInduction) {
2473       ScalarIV = IV->getType()->isIntegerTy()
2474                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2475                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2476                                           IV->getType());
2477       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID,
2478                                       State.CFG.PrevBB);
2479       ScalarIV->setName("offset.idx");
2480     }
2481     if (Trunc) {
2482       auto *TruncType = cast<IntegerType>(Trunc->getType());
2483       assert(Step->getType()->isIntegerTy() &&
2484              "Truncation requires an integer step");
2485       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2486       Step = Builder.CreateTrunc(Step, TruncType);
2487     }
2488     return ScalarIV;
2489   };
2490 
2491   // Create the vector values from the scalar IV, for the case where no vector
2492   // IV is created.
2493   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2494     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2495     for (unsigned Part = 0; Part < UF; ++Part) {
2496       assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
2497       Value *StartIdx;
2498       if (Step->getType()->isFloatingPointTy())
2499         StartIdx =
2500             getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part);
2501       else
2502         StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part);
2503 
2504       Value *EntryPart =
2505           getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode());
2506       State.set(Def, EntryPart, Part);
2507       if (Trunc)
2508         addMetadata(EntryPart, Trunc);
2509     }
2510   };
2511 
2512   // Fast-math-flags propagate from the original induction instruction.
2513   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2514   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2515     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2516 
2517   // Now do the actual transformations, and start with creating the step value.
2518   Value *Step = CreateStepValue(ID.getStep());
2519   if (State.VF.isZero() || State.VF.isScalar()) {
2520     Value *ScalarIV = CreateScalarIV(Step);
2521     CreateSplatIV(ScalarIV, Step);
2522     return;
2523   }
2524 
2525   // Determine if we want a scalar version of the induction variable. This is
2526   // true if the induction variable itself is not widened, or if it has at
2527   // least one user in the loop that is not widened.
2528   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2529   if (!NeedsScalarIV) {
2530     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
2531     return;
2532   }
2533 
2534   // Try to create a new independent vector induction variable. If we can't
2535   // create the phi node, we will splat the scalar induction variable in each
2536   // loop iteration.
2537   if (!shouldScalarizeInstruction(EntryVal)) {
2538     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
2539     Value *ScalarIV = CreateScalarIV(Step);
2540     // Create scalar steps that can be used by instructions we will later
2541     // scalarize. Note that the addition of the scalar steps will not increase
2542     // the number of instructions in the loop in the common case prior to
2543     // InstCombine. We will be trading one vector extract for each scalar step.
2544     buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
2545     return;
2546   }
2547 
2548   // All IV users are scalar instructions, so only emit a scalar IV, not a
2549   // vectorized IV, except when we tail-fold: then the splat IV feeds the
2550   // predicate used by the masked loads/stores.
2551   Value *ScalarIV = CreateScalarIV(Step);
2552   if (!Cost->isScalarEpilogueAllowed())
2553     CreateSplatIV(ScalarIV, Step);
2554   buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
2555 }
2556 
2557 Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
2558                                           Value *Step,
2559                                           Instruction::BinaryOps BinOp) {
2560   // Create and check the types.
2561   auto *ValVTy = cast<VectorType>(Val->getType());
2562   ElementCount VLen = ValVTy->getElementCount();
2563 
2564   Type *STy = Val->getType()->getScalarType();
2565   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2566          "Induction Step must be an integer or FP");
2567   assert(Step->getType() == STy && "Step has wrong type");
2568 
2569   SmallVector<Constant *, 8> Indices;
2570 
2571   // Create a vector of consecutive numbers from zero to VF.
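       // For example (integer case, illustrative): with Val = splat(%start),
       // StartIdx = 0 and Step = %s, the step vector returned below is
       //   <%start, %start + %s, %start + 2 * %s, ...>.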
2572   VectorType *InitVecValVTy = ValVTy;
2573   Type *InitVecValSTy = STy;
2574   if (STy->isFloatingPointTy()) {
2575     InitVecValSTy =
2576         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2577     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2578   }
2579   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2580 
2581   // Splat the StartIdx
2582   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2583 
2584   if (STy->isIntegerTy()) {
2585     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2586     Step = Builder.CreateVectorSplat(VLen, Step);
2587     assert(Step->getType() == Val->getType() && "Invalid step vec");
2588     // FIXME: The newly created binary instructions should contain nsw/nuw
2589     //        flags, which can be derived from the original scalar operations.
2590     Step = Builder.CreateMul(InitVec, Step);
2591     return Builder.CreateAdd(Val, Step, "induction");
2592   }
2593 
2594   // Floating point induction.
2595   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2596          "Binary Opcode should be specified for FP induction");
2597   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2598   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2599 
2600   Step = Builder.CreateVectorSplat(VLen, Step);
2601   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2602   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2603 }
2604 
2605 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2606                                            Instruction *EntryVal,
2607                                            const InductionDescriptor &ID,
2608                                            VPValue *Def,
2609                                            VPTransformState &State) {
2610   IRBuilder<> &Builder = State.Builder;
2611   // We shouldn't have to build scalar steps if we aren't vectorizing.
2612   assert(State.VF.isVector() && "VF should be greater than one");
2613   // Get the value type and ensure it and the step have the same type.
2614   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2615   assert(ScalarIVTy == Step->getType() &&
2616          "Val and Step should have the same type");
2617 
2618   // We build scalar steps for both integer and floating-point induction
2619   // variables. Here, we determine the kind of arithmetic we will perform.
2620   Instruction::BinaryOps AddOp;
2621   Instruction::BinaryOps MulOp;
2622   if (ScalarIVTy->isIntegerTy()) {
2623     AddOp = Instruction::Add;
2624     MulOp = Instruction::Mul;
2625   } else {
2626     AddOp = ID.getInductionOpcode();
2627     MulOp = Instruction::FMul;
2628   }
2629 
2630   // Determine the number of scalars we need to generate for each unroll
2631   // iteration. If EntryVal is uniform, we only need to generate the first
2632   // lane. Otherwise, we generate all VF values.
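       // For example (illustrative, non-uniform case with fixed VF = 4 and integer
       // step S), the loop below produces for each part P the scalars
       //   ScalarIV + (P * 4 + L) * S   for lanes L = 0..3.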
2633   bool IsUniform =
2634       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF);
2635   unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
2636   // Compute the scalar steps and save the results in State.
2637   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2638                                      ScalarIVTy->getScalarSizeInBits());
2639   Type *VecIVTy = nullptr;
2640   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2641   if (!IsUniform && State.VF.isScalable()) {
2642     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2643     UnitStepVec =
2644         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2645     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2646     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2647   }
2648 
2649   for (unsigned Part = 0; Part < State.UF; ++Part) {
2650     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2651 
2652     if (!IsUniform && State.VF.isScalable()) {
2653       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2654       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2655       if (ScalarIVTy->isFloatingPointTy())
2656         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2657       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2658       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2659       State.set(Def, Add, Part);
2660       // It's also useful to record the lane values for the known minimum number
2661       // of elements, so we do that below as well. This improves the code quality
2662       // when trying to extract the first element, for example.
2663     }
2664 
2665     if (ScalarIVTy->isFloatingPointTy())
2666       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2667 
2668     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2669       Value *StartIdx = Builder.CreateBinOp(
2670           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2671       // The step returned by `createStepForVF` is a runtime-evaluated value
2672       // when VF is scalable. Otherwise, it should be folded into a Constant.
2673       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2674              "Expected StartIdx to be folded to a constant when VF is not "
2675              "scalable");
2676       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2677       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2678       State.set(Def, Add, VPIteration(Part, Lane));
2679     }
2680   }
2681 }
2682 
2683 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2684                                                     const VPIteration &Instance,
2685                                                     VPTransformState &State) {
2686   Value *ScalarInst = State.get(Def, Instance);
2687   Value *VectorValue = State.get(Def, Instance.Part);
2688   VectorValue = Builder.CreateInsertElement(
2689       VectorValue, ScalarInst,
2690       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2691   State.set(Def, VectorValue, Instance.Part);
2692 }
2693 
2694 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2695   assert(Vec->getType()->isVectorTy() && "Invalid type");
2696   return Builder.CreateVectorReverse(Vec, "reverse");
2697 }
2698 
2699 // Return whether we allow using masked interleave-groups (for dealing with
2700 // strided loads/stores that reside in predicated blocks, or for dealing
2701 // with gaps).
2702 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2703   // If an override option has been passed in for interleaved accesses, use it.
2704   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2705     return EnableMaskedInterleavedMemAccesses;
2706 
2707   return TTI.enableMaskedInterleavedAccessVectorization();
2708 }
2709 
2710 // Try to vectorize the interleave group that \p Instr belongs to.
2711 //
2712 // E.g. Translate following interleaved load group (factor = 3):
2713 //   for (i = 0; i < N; i+=3) {
2714 //     R = Pic[i];             // Member of index 0
2715 //     G = Pic[i+1];           // Member of index 1
2716 //     B = Pic[i+2];           // Member of index 2
2717 //     ... // do something to R, G, B
2718 //   }
2719 // To:
2720 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2721 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2722 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2723 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2724 //
2725 // Or translate following interleaved store group (factor = 3):
2726 //   for (i = 0; i < N; i+=3) {
2727 //     ... do something to R, G, B
2728 //     Pic[i]   = R;           // Member of index 0
2729 //     Pic[i+1] = G;           // Member of index 1
2730 //     Pic[i+2] = B;           // Member of index 2
2731 //   }
2732 // To:
2733 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2734 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2735 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2736 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2737 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2738 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2739     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2740     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2741     VPValue *BlockInMask) {
2742   Instruction *Instr = Group->getInsertPos();
2743   const DataLayout &DL = Instr->getModule()->getDataLayout();
2744 
2745   // Prepare for the vector type of the interleaved load/store.
2746   // Prepare the vector type of the interleaved load/store.
2747   unsigned InterleaveFactor = Group->getFactor();
2748   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2749   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2750 
2751   // Prepare the new pointers.
2752   SmallVector<Value *, 2> AddrParts;
2753   unsigned Index = Group->getIndex(Instr);
2754 
2755   // TODO: extend the masked interleaved-group support to reversed access.
2756   assert((!BlockInMask || !Group->isReverse()) &&
2757          "Reversed masked interleave-group not supported.");
2758 
2759   // If the group is reverse, adjust the index to refer to the last vector lane
2760   // instead of the first. We adjust the index from the first vector lane,
2761   // rather than directly getting the pointer for lane VF - 1, because the
2762   // pointer operand of the interleaved access is supposed to be uniform. For
2763   // uniform instructions, we're only required to generate a value for the
2764   // first vector lane in each unroll iteration.
2765   if (Group->isReverse())
2766     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2767 
2768   for (unsigned Part = 0; Part < UF; Part++) {
2769     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2770     setDebugLocFromInst(AddrPart);
2771 
2772     // Notice that the current instruction could be at any member index. We need
2773     // to adjust the address back to the member of index 0.
2774     //
2775     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2776     //       b = A[i];       // Member of index 0
2777     // The current pointer points to A[i+1]; adjust it to A[i].
2778     //
2779     // E.g.  A[i+1] = a;     // Member of index 1
2780     //       A[i]   = b;     // Member of index 0
2781     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2782     // The current pointer points to A[i+2]; adjust it to A[i].
2783 
2784     bool InBounds = false;
2785     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2786       InBounds = gep->isInBounds();
2787     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2788     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2789 
2790     // Cast to the vector pointer type.
2791     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2792     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2793     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2794   }
2795 
2796   setDebugLocFromInst(Instr);
2797   Value *PoisonVec = PoisonValue::get(VecTy);
2798 
2799   Value *MaskForGaps = nullptr;
2800   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2801     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2802     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2803   }
2804 
2805   // Vectorize the interleaved load group.
2806   if (isa<LoadInst>(Instr)) {
2807     // For each unroll part, create a wide load for the group.
2808     SmallVector<Value *, 2> NewLoads;
2809     for (unsigned Part = 0; Part < UF; Part++) {
2810       Instruction *NewLoad;
2811       if (BlockInMask || MaskForGaps) {
2812         assert(useMaskedInterleavedAccesses(*TTI) &&
2813                "masked interleaved groups are not allowed.");
2814         Value *GroupMask = MaskForGaps;
2815         if (BlockInMask) {
2816           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2817           Value *ShuffledMask = Builder.CreateShuffleVector(
2818               BlockInMaskPart,
2819               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2820               "interleaved.mask");
2821           GroupMask = MaskForGaps
2822                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2823                                                 MaskForGaps)
2824                           : ShuffledMask;
2825         }
2826         NewLoad =
2827             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2828                                      GroupMask, PoisonVec, "wide.masked.vec");
2829       } else
2831         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2832                                             Group->getAlign(), "wide.vec");
2833       Group->addMetadata(NewLoad);
2834       NewLoads.push_back(NewLoad);
2835     }
2836 
2837     // For each member in the group, shuffle out the appropriate data from the
2838     // wide loads.
2839     unsigned J = 0;
2840     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2841       Instruction *Member = Group->getMember(I);
2842 
2843       // Skip the gaps in the group.
2844       if (!Member)
2845         continue;
2846 
2847       auto StrideMask =
2848           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2849       for (unsigned Part = 0; Part < UF; Part++) {
2850         Value *StridedVec = Builder.CreateShuffleVector(
2851             NewLoads[Part], StrideMask, "strided.vec");
2852 
2853         // If this member has a different type, cast the result to its type.
2854         if (Member->getType() != ScalarTy) {
2855           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2856           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2857           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2858         }
2859 
2860         if (Group->isReverse())
2861           StridedVec = reverseVector(StridedVec);
2862 
2863         State.set(VPDefs[J], StridedVec, Part);
2864       }
2865       ++J;
2866     }
2867     return;
2868   }
2869 
2870   // The sub-vector type for the current instruction.
2871   auto *SubVT = VectorType::get(ScalarTy, VF);
2872 
2873   // Vectorize the interleaved store group.
2874   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2875   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2876          "masked interleaved groups are not allowed.");
2877   assert((!MaskForGaps || !VF.isScalable()) &&
2878          "masking gaps for scalable vectors is not yet supported.");
2879   for (unsigned Part = 0; Part < UF; Part++) {
2880     // Collect the stored vector from each member.
2881     SmallVector<Value *, 4> StoredVecs;
2882     for (unsigned i = 0; i < InterleaveFactor; i++) {
2883       assert((Group->getMember(i) || MaskForGaps) &&
2884              "Failed to get a member from an interleaved store group");
2885       Instruction *Member = Group->getMember(i);
2886 
2887       // Skip the gaps in the group.
2888       if (!Member) {
2889         Value *Undef = PoisonValue::get(SubVT);
2890         StoredVecs.push_back(Undef);
2891         continue;
2892       }
2893 
2894       Value *StoredVec = State.get(StoredValues[i], Part);
2895 
2896       if (Group->isReverse())
2897         StoredVec = reverseVector(StoredVec);
2898 
2899       // If this member has a different type, cast it to a unified type.
2900 
2901       if (StoredVec->getType() != SubVT)
2902         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2903 
2904       StoredVecs.push_back(StoredVec);
2905     }
2906 
2907     // Concatenate all vectors into a wide vector.
2908     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2909 
2910     // Interleave the elements in the wide vector.
2911     Value *IVec = Builder.CreateShuffleVector(
2912         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2913         "interleaved.vec");
2914 
2915     Instruction *NewStoreInstr;
2916     if (BlockInMask || MaskForGaps) {
2917       Value *GroupMask = MaskForGaps;
2918       if (BlockInMask) {
2919         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2920         Value *ShuffledMask = Builder.CreateShuffleVector(
2921             BlockInMaskPart,
2922             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2923             "interleaved.mask");
2924         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2925                                                       ShuffledMask, MaskForGaps)
2926                                 : ShuffledMask;
2927       }
2928       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2929                                                 Group->getAlign(), GroupMask);
2930     } else
2931       NewStoreInstr =
2932           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2933 
2934     Group->addMetadata(NewStoreInstr);
2935   }
2936 }
2937 
2938 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2939                                                VPReplicateRecipe *RepRecipe,
2940                                                const VPIteration &Instance,
2941                                                bool IfPredicateInstr,
2942                                                VPTransformState &State) {
2943   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2944 
2945   // llvm.experimental.noalias.scope.decl intrinsics must not be duplicated per
2946   // lane or part; only emit them for the first lane of the first part.
2947   if (isa<NoAliasScopeDeclInst>(Instr))
2948     if (!Instance.isFirstIteration())
2949       return;
2950 
2951   setDebugLocFromInst(Instr);
2952 
2953   // Does this instruction return a value?
2954   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2955 
2956   Instruction *Cloned = Instr->clone();
2957   if (!IsVoidRetTy)
2958     Cloned->setName(Instr->getName() + ".cloned");
2959 
2960   // If the scalarized instruction contributes to the address computation of a
2961   // widened masked load/store which was in a basic block that needed predication
2962   // and is not predicated after vectorization, we can't propagate
2963   // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2964   // instruction could feed a poison value to the base address of the widened
2965   // load/store.
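       // For example (illustrative), a "getelementptr inbounds" feeding the address
       // of such a widened masked load has its "inbounds" flag dropped here, since
       // the GEP is no longer guarded by the original block's predicate.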
2966   if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0)
2967     Cloned->dropPoisonGeneratingFlags();
2968 
2969   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
2970                                Builder.GetInsertPoint());
2971   // Replace the operands of the cloned instructions with their scalar
2972   // equivalents in the new loop.
2973   for (auto &I : enumerate(RepRecipe->operands())) {
2974     auto InputInstance = Instance;
2975     VPValue *Operand = I.value();
2976     if (State.Plan->isUniformAfterVectorization(Operand))
2977       InputInstance.Lane = VPLane::getFirstLane();
2978     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2979   }
2980   addNewMetadata(Cloned, Instr);
2981 
2982   // Place the cloned scalar in the new loop.
2983   Builder.Insert(Cloned);
2984 
2985   State.set(RepRecipe, Cloned, Instance);
2986 
2987   // If we just cloned a new assumption, add it to the assumption cache.
2988   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2989     AC->registerAssumption(II);
2990 
2991   // End if-block.
2992   if (IfPredicateInstr)
2993     PredicatedInstructions.push_back(Cloned);
2994 }
2995 
2996 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2997                                                       Value *End, Value *Step,
2998                                                       Instruction *DL) {
2999   BasicBlock *Header = L->getHeader();
3000   BasicBlock *Latch = L->getLoopLatch();
3001   // As we're just creating this loop, it's possible that no latch exists
3002   // yet. If so, use the header, as this will be a single-block loop.
3003   if (!Latch)
3004     Latch = Header;
3005 
3006   IRBuilder<> B(&*Header->getFirstInsertionPt());
3007   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3008   setDebugLocFromInst(OldInst, &B);
3009   auto *Induction = B.CreatePHI(Start->getType(), 2, "index");
3010 
3011   B.SetInsertPoint(Latch->getTerminator());
3012   setDebugLocFromInst(OldInst, &B);
3013 
3014   // Create i+1 and fill the PHINode.
3015   //
3016   // If the tail is not folded, we know that End - Start >= Step (either
3017   // statically or through the minimum iteration checks). We also know that both
3018   // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
3019   // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
3020   // overflows and we can mark the induction increment as NUW.
3021   Value *Next = B.CreateAdd(Induction, Step, "index.next",
3022                             /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
3023   Induction->addIncoming(Start, L->getLoopPreheader());
3024   Induction->addIncoming(Next, Latch);
3025   // Create the compare.
3026   Value *ICmp = B.CreateICmpEQ(Next, End);
3027   B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3028 
3029   // Now we have two terminators. Remove the old one from the block.
3030   Latch->getTerminator()->eraseFromParent();
3031 
3032   return Induction;
3033 }
3034 
3035 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3036   if (TripCount)
3037     return TripCount;
3038 
3039   assert(L && "Create Trip Count for null loop.");
3040   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3041   // Find the loop boundaries.
3042   ScalarEvolution *SE = PSE.getSE();
3043   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3044   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3045          "Invalid loop count");
3046 
3047   Type *IdxTy = Legal->getWidestInductionType();
3048   assert(IdxTy && "No type for induction");
3049 
3050   // The exit count might have type i64 while the phi is i32. This can happen
3051   // if we have an induction variable that is sign-extended before the compare.
3052   // The only way we get a backedge-taken count in that case is if the induction
3053   // variable was signed and as such will not overflow, so the truncation is
3054   // legal.
3055   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3056       IdxTy->getPrimitiveSizeInBits())
3057     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3058   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3059 
3060   // Get the total trip count from the count by adding 1.
3061   const SCEV *ExitCount = SE->getAddExpr(
3062       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3063 
3064   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3065 
3066   // Expand the trip count and place the new instructions in the preheader.
3067   // Notice that the pre-header does not change, only the loop body.
3068   SCEVExpander Exp(*SE, DL, "induction");
3069 
3070   // Count holds the overall loop count (N).
3071   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3072                                 L->getLoopPreheader()->getTerminator());
3073 
3074   if (TripCount->getType()->isPointerTy())
3075     TripCount =
3076         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3077                                     L->getLoopPreheader()->getTerminator());
3078 
3079   return TripCount;
3080 }
3081 
3082 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3083   if (VectorTripCount)
3084     return VectorTripCount;
3085 
3086   Value *TC = getOrCreateTripCount(L);
3087   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3088 
3089   Type *Ty = TC->getType();
3090   // This is where we can make the step a runtime constant.
3091   Value *Step = createStepForVF(Builder, Ty, VF, UF);
3092 
3093   // If the tail is to be folded by masking, round the number of iterations N
3094   // up to a multiple of Step instead of rounding down. This is done by first
3095   // adding Step-1 and then rounding down. Note that it's ok if this addition
3096   // overflows: the vector induction variable will eventually wrap to zero given
3097   // that it starts at zero and its Step is a power of two; the loop will then
3098   // exit, with the last early-exit vector comparison also producing all-true.
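       // For example (illustrative), with N = 10 and Step = VF * UF = 4:
       //   n.rnd.up = 10 + 3 = 13, n.mod.vf = 13 % 4 = 1, n.vec = 13 - 1 = 12,
       // i.e. three masked vector iterations cover indices 0..11, with indices 10
       // and 11 disabled by the fold-tail mask.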
3099   if (Cost->foldTailByMasking()) {
3100     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3101            "VF*UF must be a power of 2 when folding tail by masking");
3102     assert(!VF.isScalable() &&
3103            "Tail folding not yet supported for scalable vectors");
3104     TC = Builder.CreateAdd(
3105         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3106   }
3107 
3108   // Now we need to generate the expression for the part of the loop that the
3109   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3110   // iterations are not required for correctness, or N - Step, otherwise. Step
3111   // is equal to the vectorization factor (number of SIMD elements) times the
3112   // unroll factor (number of SIMD instructions).
3113   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3114 
3115   // There are cases where we *must* run at least one iteration in the remainder
3116   // loop.  See the cost model for when this can happen.  If the step evenly
3117   // divides the trip count, we set the remainder to be equal to the step. If
3118   // the step does not evenly divide the trip count, no adjustment is necessary
3119   // since there will already be scalar iterations. Note that the minimum
3120   // iterations check ensures that N >= Step.
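       // For example (illustrative), with Step = 8: N = 20 gives n.mod.vf = 4 and
       // n.vec = 16 (4 scalar iterations remain anyway), whereas N = 24 gives
       // n.mod.vf = 0, which is bumped to 8 so that n.vec = 16 and the required
       // scalar epilogue still runs 8 iterations.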
3121   if (Cost->requiresScalarEpilogue(VF)) {
3122     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3123     R = Builder.CreateSelect(IsZero, Step, R);
3124   }
3125 
3126   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3127 
3128   return VectorTripCount;
3129 }
3130 
3131 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3132                                                    const DataLayout &DL) {
3133   // Verify that V is a vector with the same number of elements as DstVTy.
3134   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3135   unsigned VF = DstFVTy->getNumElements();
3136   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3137   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3138   Type *SrcElemTy = SrcVecTy->getElementType();
3139   Type *DstElemTy = DstFVTy->getElementType();
3140   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3141          "Vector elements must have same size");
3142 
3143   // Do a direct cast if element types are castable.
3144   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3145     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3146   }
3147   // V cannot be directly cast to the desired vector type. This may happen when
3148   // V is a floating-point vector but DstVTy is a vector of pointers, or
3149   // vice-versa. Handle this with a two-step cast using an intermediate integer
3150   // type, i.e. Ptr <-> Int <-> Float.
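       // For example (illustrative, assuming 64-bit pointers): casting <4 x double>
       // to <4 x i8*> is done as <4 x double> -> <4 x i64> -> <4 x i8*>.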
3151   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3152          "Only one type should be a pointer type");
3153   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3154          "Only one type should be a floating point type");
3155   Type *IntTy =
3156       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3157   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3158   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3159   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3160 }
3161 
3162 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3163                                                          BasicBlock *Bypass) {
3164   Value *Count = getOrCreateTripCount(L);
3165   // Reuse existing vector loop preheader for TC checks.
3166   // Note that a new preheader block is generated for the vector loop.
3167   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3168   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3169 
3170   // Generate code to check if the loop's trip count is less than VF * UF, or
3171   // equal to it in case a scalar epilogue is required; this implies that the
3172   // vector trip count is zero. This check also covers the case where adding one
3173   // to the backedge-taken count overflowed leading to an incorrect trip count
3174   // of zero. In this case we will also jump to the scalar loop.
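       // For example (illustrative), with VF * UF = 8 the bypass to the scalar loop
       // is taken when the trip count is < 8, or <= 8 if a scalar epilogue must run.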
3175   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
3176                                             : ICmpInst::ICMP_ULT;
3177 
3178   // If tail is to be folded, vector loop takes care of all iterations.
3179   Value *CheckMinIters = Builder.getFalse();
3180   if (!Cost->foldTailByMasking()) {
3181     Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
3182     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3183   }
3184   // Create new preheader for vector loop.
3185   LoopVectorPreHeader =
3186       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3187                  "vector.ph");
3188 
3189   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3190                                DT->getNode(Bypass)->getIDom()) &&
3191          "TC check is expected to dominate Bypass");
3192 
3193   // Update dominator for Bypass & LoopExit (if needed).
3194   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3195   if (!Cost->requiresScalarEpilogue(VF))
3196     // If there is an epilogue which must run, there's no edge from the
3197     // middle block to exit blocks and thus no need to update the immediate
3198     // dominator of the exit blocks.
3199     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3200 
3201   ReplaceInstWithInst(
3202       TCCheckBlock->getTerminator(),
3203       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3204   LoopBypassBlocks.push_back(TCCheckBlock);
3205 }
3206 
3207 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3208 
3209   BasicBlock *const SCEVCheckBlock =
3210       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3211   if (!SCEVCheckBlock)
3212     return nullptr;
3213 
3214   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3215            (OptForSizeBasedOnProfile &&
3216             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3217          "Cannot SCEV check stride or overflow when optimizing for size");
3218
3220   // Update dominator only if this is first RT check.
3221   if (LoopBypassBlocks.empty()) {
3222     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3223     if (!Cost->requiresScalarEpilogue(VF))
3224       // If there is an epilogue which must run, there's no edge from the
3225       // middle block to exit blocks and thus no need to update the immediate
3226       // dominator of the exit blocks.
3227       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3228   }
3229 
3230   LoopBypassBlocks.push_back(SCEVCheckBlock);
3231   AddedSafetyChecks = true;
3232   return SCEVCheckBlock;
3233 }
3234 
3235 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3236                                                       BasicBlock *Bypass) {
3237   // VPlan-native path does not do any analysis for runtime checks currently.
3238   if (EnableVPlanNativePath)
3239     return nullptr;
3240 
3241   BasicBlock *const MemCheckBlock =
3242       RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3243 
3244   // Check if we generated code that checks at runtime whether arrays overlap.
3245   // We put the checks into a separate block to make the more common case of
3246   // few elements faster.
3247   if (!MemCheckBlock)
3248     return nullptr;
3249 
3250   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3251     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3252            "Cannot emit memory checks when optimizing for size, unless forced "
3253            "to vectorize.");
3254     ORE->emit([&]() {
3255       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3256                                         L->getStartLoc(), L->getHeader())
3257              << "Code-size may be reduced by not forcing "
3258                 "vectorization, or by source-code modifications "
3259                 "eliminating the need for runtime checks "
3260                 "(e.g., adding 'restrict').";
3261     });
3262   }
3263 
3264   LoopBypassBlocks.push_back(MemCheckBlock);
3265 
3266   AddedSafetyChecks = true;
3267 
3268   // We currently don't use LoopVersioning for the actual loop cloning but we
3269   // still use it to add the noalias metadata.
3270   LVer = std::make_unique<LoopVersioning>(
3271       *Legal->getLAI(),
3272       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3273       DT, PSE.getSE());
3274   LVer->prepareNoAliasMetadata();
3275   return MemCheckBlock;
3276 }
3277 
3278 Value *InnerLoopVectorizer::emitTransformedIndex(
3279     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3280     const InductionDescriptor &ID, BasicBlock *VectorHeader) const {
3281 
3282   SCEVExpander Exp(*SE, DL, "induction");
3283   auto Step = ID.getStep();
3284   auto StartValue = ID.getStartValue();
3285   assert(Index->getType()->getScalarType() == Step->getType() &&
3286          "Index scalar type does not match StepValue type");
3287 
3288   // Note: the IR at this point is broken. We cannot use SE to create any new
3289   // SCEV and then expand it, hoping that SCEV's simplification will give us
3290   // more optimal code. Unfortunately, attempting to do so on invalid IR may
3291   // lead to various SCEV crashes. So all we can do is use the builder and rely
3292   // on InstCombine for future simplifications. Here we handle some trivial
3293   // cases only.
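       // As a rough sketch, the cases handled below compute:
       //   IK_IntInduction: StartValue + Index * Step
       //   IK_PtrInduction: gep ElementType, StartValue, Index * Step
       //   IK_FpInduction:  StartValue fadd/fsub (Index fmul Step)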
3294   auto CreateAdd = [&B](Value *X, Value *Y) {
3295     assert(X->getType() == Y->getType() && "Types don't match!");
3296     if (auto *CX = dyn_cast<ConstantInt>(X))
3297       if (CX->isZero())
3298         return Y;
3299     if (auto *CY = dyn_cast<ConstantInt>(Y))
3300       if (CY->isZero())
3301         return X;
3302     return B.CreateAdd(X, Y);
3303   };
3304 
3305   // We allow X to be a vector type, in which case Y will potentially be
3306   // splatted into a vector with the same element count.
3307   auto CreateMul = [&B](Value *X, Value *Y) {
3308     assert(X->getType()->getScalarType() == Y->getType() &&
3309            "Types don't match!");
3310     if (auto *CX = dyn_cast<ConstantInt>(X))
3311       if (CX->isOne())
3312         return Y;
3313     if (auto *CY = dyn_cast<ConstantInt>(Y))
3314       if (CY->isOne())
3315         return X;
3316     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3317     if (XVTy && !isa<VectorType>(Y->getType()))
3318       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3319     return B.CreateMul(X, Y);
3320   };
3321 
3322   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3323   // loop, choose the end of the vector loop header (=VectorHeader), because
3324   // the DomTree is not kept up-to-date for additional blocks generated in the
3325   // vector loop. By using the header as insertion point, we guarantee that the
3326   // expanded instructions dominate all their uses.
3327   auto GetInsertPoint = [this, &B, VectorHeader]() {
3328     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3329     if (InsertBB != LoopVectorBody &&
3330         LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB))
3331       return VectorHeader->getTerminator();
3332     return &*B.GetInsertPoint();
3333   };
3334 
3335   switch (ID.getKind()) {
3336   case InductionDescriptor::IK_IntInduction: {
3337     assert(!isa<VectorType>(Index->getType()) &&
3338            "Vector indices not supported for integer inductions yet");
3339     assert(Index->getType() == StartValue->getType() &&
3340            "Index type does not match StartValue type");
3341     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3342       return B.CreateSub(StartValue, Index);
3343     auto *Offset = CreateMul(
3344         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3345     return CreateAdd(StartValue, Offset);
3346   }
3347   case InductionDescriptor::IK_PtrInduction: {
3348     assert(isa<SCEVConstant>(Step) &&
3349            "Expected constant step for pointer induction");
3350     return B.CreateGEP(
3351         ID.getElementType(), StartValue,
3352         CreateMul(Index,
3353                   Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3354                                     GetInsertPoint())));
3355   }
3356   case InductionDescriptor::IK_FpInduction: {
3357     assert(!isa<VectorType>(Index->getType()) &&
3358            "Vector indices not supported for FP inductions yet");
3359     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3360     auto InductionBinOp = ID.getInductionBinOp();
3361     assert(InductionBinOp &&
3362            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3363             InductionBinOp->getOpcode() == Instruction::FSub) &&
3364            "Original bin op should be defined for FP induction");
3365 
3366     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3367     Value *MulExp = B.CreateFMul(StepValue, Index);
3368     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3369                          "induction");
3370   }
3371   case InductionDescriptor::IK_NoInduction:
3372     return nullptr;
3373   }
3374   llvm_unreachable("invalid enum");
3375 }
3376 
3377 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3378   LoopScalarBody = OrigLoop->getHeader();
3379   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3380   assert(LoopVectorPreHeader && "Invalid loop structure");
3381   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3382   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3383          "multiple exit loop without required epilogue?");
3384 
3385   LoopMiddleBlock =
3386       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3387                  LI, nullptr, Twine(Prefix) + "middle.block");
3388   LoopScalarPreHeader =
3389       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3390                  nullptr, Twine(Prefix) + "scalar.ph");
3391 
3392   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3393 
3394   // Set up the middle block terminator.  Two cases:
3395   // 1) If we know that we must execute the scalar epilogue, emit an
3396   //    unconditional branch.
3397   // 2) Otherwise, we must have a single unique exit block (due to how we
3398   //    implement the multiple exit case).  In this case, set up a conditional
3399   //    branch from the middle block to the loop scalar preheader, and the
3400   //    exit block.  completeLoopSkeleton will update the condition to use an
3401   //    iteration check, if required to decide whether to execute the remainder.
3402   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3403     BranchInst::Create(LoopScalarPreHeader) :
3404     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3405                        Builder.getTrue());
3406   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3407   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3408 
3409   // We intentionally don't let SplitBlock update LoopInfo, since
3410   // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3411   // LoopVectorBody is explicitly added to the correct place a few lines later.
3412   LoopVectorBody =
3413       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3414                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3415 
3416   // Update dominator for loop exit.
3417   if (!Cost->requiresScalarEpilogue(VF))
3418     // If there is an epilogue which must run, there's no edge from the
3419     // middle block to exit blocks and thus no need to update the immediate
3420     // dominator of the exit blocks.
3421     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3422 
3423   // Create and register the new vector loop.
3424   Loop *Lp = LI->AllocateLoop();
3425   Loop *ParentLoop = OrigLoop->getParentLoop();
3426 
3427   // Insert the new loop into the loop nest and register the new basic blocks
3428   // before calling any utilities such as SCEV that require valid LoopInfo.
3429   if (ParentLoop) {
3430     ParentLoop->addChildLoop(Lp);
3431   } else {
3432     LI->addTopLevelLoop(Lp);
3433   }
3434   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3435   return Lp;
3436 }
3437 
3438 void InnerLoopVectorizer::createInductionResumeValues(
3439     Loop *L, Value *VectorTripCount,
3440     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3441   assert(VectorTripCount && L && "Expected valid arguments");
3442   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3443           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3444          "Inconsistent information about additional bypass.");
3445   // We are going to resume the execution of the scalar loop.
3446   // Go over all of the induction variables that we found and fix the
3447   // PHIs that are left in the scalar version of the loop.
3448   // The starting values of PHI nodes depend on the counter of the last
3449   // iteration in the vectorized loop.
3450   // If we come from a bypass edge then we need to start from the original
3451   // start value.
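       // For example (illustrative), for a non-primary integer IV "i = S; i += St"
       // the resume value is S + VectorTripCount * St when entering the scalar loop
       // from the middle block, and simply S when arriving via a bypass block.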
3452   for (auto &InductionEntry : Legal->getInductionVars()) {
3453     PHINode *OrigPhi = InductionEntry.first;
3454     InductionDescriptor II = InductionEntry.second;
3455 
3456     // Create phi nodes to merge from the backedge-taken check block.
3457     PHINode *BCResumeVal =
3458         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3459                         LoopScalarPreHeader->getTerminator());
3460     // Copy original phi DL over to the new one.
3461     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3462     Value *&EndValue = IVEndValues[OrigPhi];
3463     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3464     if (OrigPhi == OldInduction) {
3465       // We know what the end value is.
3466       EndValue = VectorTripCount;
3467     } else {
3468       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3469 
3470       // Fast-math-flags propagate from the original induction instruction.
3471       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3472         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3473 
3474       Type *StepType = II.getStep()->getType();
3475       Instruction::CastOps CastOp =
3476           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3477       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3478       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3479       EndValue =
3480           emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
3481       EndValue->setName("ind.end");
3482 
3483       // Compute the end value for the additional bypass (if applicable).
3484       if (AdditionalBypass.first) {
3485         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3486         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3487                                          StepType, true);
3488         CRD =
3489             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3490         EndValueFromAdditionalBypass =
3491             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
3492         EndValueFromAdditionalBypass->setName("ind.end");
3493       }
3494     }
3495     // The new PHI merges the original incoming value, in case of a bypass,
3496     // or the value at the end of the vectorized loop.
3497     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3498 
3499     // Fix the scalar body counter (PHI node).
3500     // The old induction's phi node in the scalar body needs the truncated
3501     // value.
3502     for (BasicBlock *BB : LoopBypassBlocks)
3503       BCResumeVal->addIncoming(II.getStartValue(), BB);
3504 
3505     if (AdditionalBypass.first)
3506       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3507                                             EndValueFromAdditionalBypass);
3508 
3509     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3510   }
3511 }
3512 
3513 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3514                                                       MDNode *OrigLoopID) {
3515   assert(L && "Expected valid loop.");
3516 
3517   // The trip counts should be cached by now.
3518   Value *Count = getOrCreateTripCount(L);
3519   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3520 
3521   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3522 
3523   // Add a check in the middle block to see if we have completed
3524   // all of the iterations in the first vector loop.  Three cases:
3525   // 1) If we require a scalar epilogue, there is no conditional branch as
3526   //    we unconditionally branch to the scalar preheader.  Do nothing.
3527   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3528   //    Thus if tail is to be folded, we know we don't need to run the
3529   //    remainder and we can use the previous value for the condition (true).
3530   // 3) Otherwise, construct a runtime check.
3531   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3532     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3533                                         Count, VectorTripCount, "cmp.n",
3534                                         LoopMiddleBlock->getTerminator());
3535 
3536     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3537     // of the corresponding compare because they may have ended up with
3538     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3540     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3541     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3542   }
3543 
3544   // Get ready to start creating new instructions into the vectorized body.
3545   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3546          "Inconsistent vector loop preheader");
3547   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3548 
3549   Optional<MDNode *> VectorizedLoopID =
3550       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3551                                       LLVMLoopVectorizeFollowupVectorized});
3552   if (VectorizedLoopID.hasValue()) {
3553     L->setLoopID(VectorizedLoopID.getValue());
3554 
3555     // Do not setAlreadyVectorized if loop attributes have been defined
3556     // explicitly.
3557     return LoopVectorPreHeader;
3558   }
3559 
3560   // Keep all loop hints from the original loop on the vector loop (we'll
3561   // replace the vectorizer-specific hints below).
3562   if (MDNode *LID = OrigLoop->getLoopID())
3563     L->setLoopID(LID);
3564 
3565   LoopVectorizeHints Hints(L, true, *ORE);
3566   Hints.setAlreadyVectorized();
3567 
3568 #ifdef EXPENSIVE_CHECKS
3569   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3570   LI->verify(*DT);
3571 #endif
3572 
3573   return LoopVectorPreHeader;
3574 }
3575 
3576 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3577   /*
3578    In this function we generate a new loop. The new loop will contain
3579    the vectorized instructions while the old loop will continue to run the
3580    scalar remainder.
3581 
3582        [ ] <-- loop iteration number check.
3583     /   |
3584    /    v
3585   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3586   |  /  |
3587   | /   v
3588   ||   [ ]     <-- vector pre header.
3589   |/    |
3590   |     v
3591   |    [  ] \
3592   |    [  ]_|   <-- vector loop.
3593   |     |
3594   |     v
3595   \   -[ ]   <--- middle-block.
3596    \/   |
3597    /\   v
3598    | ->[ ]     <--- new preheader.
3599    |    |
3600  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3601    |   [ ] \
3602    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3603     \   |
3604      \  v
3605       >[ ]     <-- exit block(s).
3606    ...
3607    */
3608 
3609   // Get the metadata of the original loop before it gets modified.
3610   MDNode *OrigLoopID = OrigLoop->getLoopID();
3611 
3612   // Workaround!  Compute the trip count of the original loop and cache it
3613   // before we start modifying the CFG.  This code has a systemic problem
3614   // wherein it tries to run analysis over partially constructed IR; this is
3615   // wrong, and not simply for SCEV.  The trip count of the original loop
3616   // simply happens to be prone to hitting this in practice.  In theory, we
3617   // can hit the same issue for any SCEV, or ValueTracking query done during
3618   // mutation.  See PR49900.
3619   getOrCreateTripCount(OrigLoop);
3620 
3621   // Create an empty vector loop, and prepare basic blocks for the runtime
3622   // checks.
3623   Loop *Lp = createVectorLoopSkeleton("");
3624 
3625   // Now, compare the new count to zero. If it is zero skip the vector loop and
3626   // jump to the scalar loop. This check also covers the case where the
3627   // backedge-taken count is uint##_max: adding one to it will overflow leading
3628   // to an incorrect trip count of zero. In this (rare) case we will also jump
3629   // to the scalar loop.
3630   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3631 
3632   // Generate the code to check any assumptions that we've made for SCEV
3633   // expressions.
3634   emitSCEVChecks(Lp, LoopScalarPreHeader);
3635 
3636   // Generate the code that checks in runtime if arrays overlap. We put the
3637   // checks into a separate block to make the more common case of few elements
3638   // faster.
3639   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3640 
3641   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators, which often have multiple pointer
3643   // induction variables. In the code below we also support a case where we
3644   // don't have a single induction variable.
3645   //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
3648   //   - is an integer
3649   //   - counts from zero, stepping by one
3650   //   - is the size of the widest induction variable type
3651   // then we create a new one.
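  // For illustration only, a primary induction we can reuse directly is a
  // canonical IV of the widest induction type, e.g. (shorthand IR):
  //   %iv      = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
  //   %iv.next = add i64 %iv, 1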
3652   OldInduction = Legal->getPrimaryInduction();
3653   Type *IdxTy = Legal->getWidestInductionType();
3654   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3655   // The loop step is equal to the vectorization factor (num of SIMD elements)
3656   // times the unroll factor (num of SIMD instructions).
3657   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3658   Value *Step = createStepForVF(Builder, IdxTy, VF, UF);
3659   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3660   Induction =
3661       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3662                               getDebugLocFromInstOrOperands(OldInduction));
3663 
3664   // Emit phis for the new starting index of the scalar loop.
3665   createInductionResumeValues(Lp, CountRoundDown);
3666 
3667   return completeLoopSkeleton(Lp, OrigLoopID);
3668 }
3669 
3670 // Fix up external users of the induction variable. At this point, we are
3671 // in LCSSA form, with all external PHIs that use the IV having one input value,
3672 // coming from the remainder loop. We need those PHIs to also have a correct
3673 // value for the IV when arriving directly from the middle block.
3674 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3675                                        const InductionDescriptor &II,
3676                                        Value *CountRoundDown, Value *EndValue,
3677                                        BasicBlock *MiddleBlock) {
3678   // There are two kinds of external IV usages - those that use the value
3679   // computed in the last iteration (the PHI) and those that use the penultimate
3680   // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they obviously have different values.
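  // For illustration only (shorthand IR), both kinds of escaping uses:
  //   loop:
  //     %iv      = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
  //     %iv.next = add i64 %iv, 1
  //   exit:
  //     %last = phi i64 [ %iv.next, %loop ] ; uses the last value
  //     %pen  = phi i64 [ %iv, %loop ]      ; uses the penultimate value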
3682 
3683   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3684 
3685   DenseMap<Value *, Value *> MissingVals;
3686 
3687   // An external user of the last iteration's value should see the value that
3688   // the remainder loop uses to initialize its own IV.
3689   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3690   for (User *U : PostInc->users()) {
3691     Instruction *UI = cast<Instruction>(U);
3692     if (!OrigLoop->contains(UI)) {
3693       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3694       MissingVals[UI] = EndValue;
3695     }
3696   }
3697 
  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent
  // SCEVs, that is Start + (Step * (CRD - 1)).
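  // As an illustrative sketch of what gets materialized in the middle block
  // (not the exact IR; see emitTransformedIndex()):
  //   %cmo    = sub %CRD, 1
  //   %escape = %Start + %Step * %cmo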
3701   for (User *U : OrigPhi->users()) {
3702     auto *UI = cast<Instruction>(U);
3703     if (!OrigLoop->contains(UI)) {
3704       const DataLayout &DL =
3705           OrigLoop->getHeader()->getModule()->getDataLayout();
3706       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3707 
3708       IRBuilder<> B(MiddleBlock->getTerminator());
3709 
3710       // Fast-math-flags propagate from the original induction instruction.
3711       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3712         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3713 
3714       Value *CountMinusOne = B.CreateSub(
3715           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3716       Value *CMO =
3717           !II.getStep()->getType()->isIntegerTy()
3718               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3719                              II.getStep()->getType())
3720               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3721       CMO->setName("cast.cmo");
3722       Value *Escape =
3723           emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody);
3724       Escape->setName("ind.escape");
3725       MissingVals[UI] = Escape;
3726     }
3727   }
3728 
3729   for (auto &I : MissingVals) {
3730     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3732     // that is %IV2 = phi [...], [ %IV1, %latch ]
3733     // In this case, if IV1 has an external use, we need to avoid adding both
3734     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3735     // don't already have an incoming value for the middle block.
3736     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3737       PHI->addIncoming(I.second, MiddleBlock);
3738   }
3739 }
3740 
3741 namespace {
3742 
3743 struct CSEDenseMapInfo {
3744   static bool canHandle(const Instruction *I) {
3745     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3746            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3747   }
3748 
3749   static inline Instruction *getEmptyKey() {
3750     return DenseMapInfo<Instruction *>::getEmptyKey();
3751   }
3752 
3753   static inline Instruction *getTombstoneKey() {
3754     return DenseMapInfo<Instruction *>::getTombstoneKey();
3755   }
3756 
3757   static unsigned getHashValue(const Instruction *I) {
3758     assert(canHandle(I) && "Unknown instruction!");
3759     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3760                                                            I->value_op_end()));
3761   }
3762 
3763   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3764     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3765         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3766       return LHS == RHS;
3767     return LHS->isIdenticalTo(RHS);
3768   }
3769 };
3770 
3771 } // end anonymous namespace
3772 
/// Perform CSE of induction variable instructions.
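/// For example (illustrative), two identical extracts produced while
/// unrolling,
///   %e1 = extractelement <4 x i32> %v, i32 3
///   %e2 = extractelement <4 x i32> %v, i32 3
/// hash to the same key; the second is replaced by the first and erased.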
3774 static void cse(BasicBlock *BB) {
3775   // Perform simple cse.
3776   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3777   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3778     if (!CSEDenseMapInfo::canHandle(&In))
3779       continue;
3780 
3781     // Check if we can replace this instruction with any of the
3782     // visited instructions.
3783     if (Instruction *V = CSEMap.lookup(&In)) {
3784       In.replaceAllUsesWith(V);
3785       In.eraseFromParent();
3786       continue;
3787     }
3788 
3789     CSEMap[&In] = &In;
3790   }
3791 }
3792 
3793 InstructionCost
3794 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3795                                               bool &NeedToScalarize) const {
3796   Function *F = CI->getCalledFunction();
3797   Type *ScalarRetTy = CI->getType();
3798   SmallVector<Type *, 4> Tys, ScalarTys;
3799   for (auto &ArgOp : CI->args())
3800     ScalarTys.push_back(ArgOp->getType());
3801 
3802   // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from them,
3804   // execute VF scalar calls, and then gather the result into the vector return
3805   // value.
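  // Roughly, as an illustrative formula mirroring the computation below:
  //   ScalarizedCost = ScalarCallCost * VF + ScalarizationOverhead
  // where the overhead covers extracting VF sets of scalar arguments and
  // inserting the VF scalar results back into the vector return value.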
3806   InstructionCost ScalarCallCost =
3807       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3808   if (VF.isScalar())
3809     return ScalarCallCost;
3810 
3811   // Compute corresponding vector type for return value and arguments.
3812   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3813   for (Type *ScalarTy : ScalarTys)
3814     Tys.push_back(ToVectorTy(ScalarTy, VF));
3815 
3816   // Compute costs of unpacking argument values for the scalar calls and
3817   // packing the return values to a vector.
3818   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3819 
3820   InstructionCost Cost =
3821       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3822 
3823   // If we can't emit a vector call for this function, then the currently found
3824   // cost is the cost we need to return.
3825   NeedToScalarize = true;
3826   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3827   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3828 
3829   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3830     return Cost;
3831 
3832   // If the corresponding vector cost is cheaper, return its cost.
3833   InstructionCost VectorCallCost =
3834       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3835   if (VectorCallCost < Cost) {
3836     NeedToScalarize = false;
3837     Cost = VectorCallCost;
3838   }
3839   return Cost;
3840 }
3841 
3842 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3843   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3844     return Elt;
3845   return VectorType::get(Elt, VF);
3846 }
3847 
3848 InstructionCost
3849 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3850                                                    ElementCount VF) const {
3851   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3852   assert(ID && "Expected intrinsic call!");
3853   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3854   FastMathFlags FMF;
3855   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3856     FMF = FPMO->getFastMathFlags();
3857 
3858   SmallVector<const Value *> Arguments(CI->args());
3859   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3860   SmallVector<Type *> ParamTys;
3861   std::transform(FTy->param_begin(), FTy->param_end(),
3862                  std::back_inserter(ParamTys),
3863                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3864 
3865   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3866                                     dyn_cast<IntrinsicInst>(CI));
3867   return TTI.getIntrinsicInstrCost(CostAttrs,
3868                                    TargetTransformInfo::TCK_RecipThroughput);
3869 }
3870 
3871 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3872   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3873   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3874   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3875 }
3876 
3877 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3878   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3879   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3880   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3881 }
3882 
3883 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3884   // For every instruction `I` in MinBWs, truncate the operands, create a
3885   // truncated version of `I` and reextend its result. InstCombine runs
3886   // later and will remove any ext/trunc pairs.
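  // For example (shorthand, illustrative IR): if MinBWs records that an i32
  // operation only needs 8 bits, a widened
  //   %a = add <4 x i32> %x, %y
  // becomes
  //   %xt = trunc <4 x i32> %x to <4 x i8>
  //   %yt = trunc <4 x i32> %y to <4 x i8>
  //   %at = add <4 x i8> %xt, %yt
  //   %a  = zext <4 x i8> %at to <4 x i32>
  // relying on InstCombine to remove the redundant ext/trunc pairs.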
3887   SmallPtrSet<Value *, 4> Erased;
3888   for (const auto &KV : Cost->getMinimalBitwidths()) {
3889     // If the value wasn't vectorized, we must maintain the original scalar
3890     // type. The absence of the value from State indicates that it
3891     // wasn't vectorized.
3892     // FIXME: Should not rely on getVPValue at this point.
3893     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3894     if (!State.hasAnyVectorValue(Def))
3895       continue;
3896     for (unsigned Part = 0; Part < UF; ++Part) {
3897       Value *I = State.get(Def, Part);
3898       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3899         continue;
3900       Type *OriginalTy = I->getType();
3901       Type *ScalarTruncatedTy =
3902           IntegerType::get(OriginalTy->getContext(), KV.second);
3903       auto *TruncatedTy = VectorType::get(
3904           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3905       if (TruncatedTy == OriginalTy)
3906         continue;
3907 
3908       IRBuilder<> B(cast<Instruction>(I));
3909       auto ShrinkOperand = [&](Value *V) -> Value * {
3910         if (auto *ZI = dyn_cast<ZExtInst>(V))
3911           if (ZI->getSrcTy() == TruncatedTy)
3912             return ZI->getOperand(0);
3913         return B.CreateZExtOrTrunc(V, TruncatedTy);
3914       };
3915 
3916       // The actual instruction modification depends on the instruction type,
3917       // unfortunately.
3918       Value *NewI = nullptr;
3919       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3920         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3921                              ShrinkOperand(BO->getOperand(1)));
3922 
3923         // Any wrapping introduced by shrinking this operation shouldn't be
3924         // considered undefined behavior. So, we can't unconditionally copy
3925         // arithmetic wrapping flags to NewI.
3926         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3927       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3928         NewI =
3929             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3930                          ShrinkOperand(CI->getOperand(1)));
3931       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3932         NewI = B.CreateSelect(SI->getCondition(),
3933                               ShrinkOperand(SI->getTrueValue()),
3934                               ShrinkOperand(SI->getFalseValue()));
3935       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3936         switch (CI->getOpcode()) {
3937         default:
3938           llvm_unreachable("Unhandled cast!");
3939         case Instruction::Trunc:
3940           NewI = ShrinkOperand(CI->getOperand(0));
3941           break;
3942         case Instruction::SExt:
3943           NewI = B.CreateSExtOrTrunc(
3944               CI->getOperand(0),
3945               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3946           break;
3947         case Instruction::ZExt:
3948           NewI = B.CreateZExtOrTrunc(
3949               CI->getOperand(0),
3950               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3951           break;
3952         }
3953       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3954         auto Elements0 =
3955             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3956         auto *O0 = B.CreateZExtOrTrunc(
3957             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3958         auto Elements1 =
3959             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3960         auto *O1 = B.CreateZExtOrTrunc(
3961             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3962 
3963         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3964       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3965         // Don't do anything with the operands, just extend the result.
3966         continue;
3967       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3968         auto Elements =
3969             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3970         auto *O0 = B.CreateZExtOrTrunc(
3971             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3972         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3973         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3974       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3975         auto Elements =
3976             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3977         auto *O0 = B.CreateZExtOrTrunc(
3978             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3979         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3980       } else {
3981         // If we don't know what to do, be conservative and don't do anything.
3982         continue;
3983       }
3984 
3985       // Lastly, extend the result.
3986       NewI->takeName(cast<Instruction>(I));
3987       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3988       I->replaceAllUsesWith(Res);
3989       cast<Instruction>(I)->eraseFromParent();
3990       Erased.insert(I);
3991       State.reset(Def, Res, Part);
3992     }
3993   }
3994 
  // We'll have created a bunch of ZExts that are now dead. Clean them up.
3996   for (const auto &KV : Cost->getMinimalBitwidths()) {
3997     // If the value wasn't vectorized, we must maintain the original scalar
3998     // type. The absence of the value from State indicates that it
3999     // wasn't vectorized.
4000     // FIXME: Should not rely on getVPValue at this point.
4001     VPValue *Def = State.Plan->getVPValue(KV.first, true);
4002     if (!State.hasAnyVectorValue(Def))
4003       continue;
4004     for (unsigned Part = 0; Part < UF; ++Part) {
4005       Value *I = State.get(Def, Part);
4006       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
4007       if (Inst && Inst->use_empty()) {
4008         Value *NewI = Inst->getOperand(0);
4009         Inst->eraseFromParent();
4010         State.reset(Def, NewI, Part);
4011       }
4012     }
4013   }
4014 }
4015 
4016 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
4017   // Insert truncates and extends for any truncated instructions as hints to
4018   // InstCombine.
4019   if (VF.isVector())
4020     truncateToMinimalBitwidths(State);
4021 
4022   // Fix widened non-induction PHIs by setting up the PHI operands.
4023   if (OrigPHIsToFix.size()) {
4024     assert(EnableVPlanNativePath &&
4025            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
4026     fixNonInductionPHIs(State);
4027   }
4028 
4029   // At this point every instruction in the original loop is widened to a
4030   // vector form. Now we need to fix the recurrences in the loop. These PHI
4031   // nodes are currently empty because we did not want to introduce cycles.
4032   // This is the second stage of vectorizing recurrences.
4033   fixCrossIterationPHIs(State);
4034 
4035   // Forget the original basic block.
4036   PSE.getSE()->forgetLoop(OrigLoop);
4037 
4038   // If we inserted an edge from the middle block to the unique exit block,
4039   // update uses outside the loop (phis) to account for the newly inserted
4040   // edge.
4041   if (!Cost->requiresScalarEpilogue(VF)) {
4042     // Fix-up external users of the induction variables.
4043     for (auto &Entry : Legal->getInductionVars())
4044       fixupIVUsers(Entry.first, Entry.second,
4045                    getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4046                    IVEndValues[Entry.first], LoopMiddleBlock);
4047 
4048     fixLCSSAPHIs(State);
4049   }
4050 
4051   for (Instruction *PI : PredicatedInstructions)
4052     sinkScalarOperands(&*PI);
4053 
4054   // Remove redundant induction instructions.
4055   cse(LoopVectorBody);
4056 
4057   // Set/update profile weights for the vector and remainder loops as original
4058   // loop iterations are now distributed among them. Note that original loop
4059   // represented by LoopScalarBody becomes remainder loop after vectorization.
4060   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly roughened result, but that should be OK since the
  // profile is not inherently precise anyway. Note also that a possible bypass
  // of the vector code caused by legality checks is ignored, optimistically
  // assigning all the weight to the vector loop.
4066   //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
4070   setProfileInfoAfterUnrolling(
4071       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4072       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4073 }
4074 
4075 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4076   // In order to support recurrences we need to be able to vectorize Phi nodes.
4077   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4078   // stage #2: We now need to fix the recurrences by adding incoming edges to
4079   // the currently empty PHI nodes. At this point every instruction in the
4080   // original loop is widened to a vector form so we can use them to construct
4081   // the incoming edges.
4082   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4083   for (VPRecipeBase &R : Header->phis()) {
4084     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4085       fixReduction(ReductionPhi, State);
4086     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4087       fixFirstOrderRecurrence(FOR, State);
4088   }
4089 }
4090 
4091 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4092                                                   VPTransformState &State) {
4093   // This is the second phase of vectorizing first-order recurrences. An
4094   // overview of the transformation is described below. Suppose we have the
4095   // following loop.
4096   //
4097   //   for (int i = 0; i < n; ++i)
4098   //     b[i] = a[i] - a[i - 1];
4099   //
4100   // There is a first-order recurrence on "a". For this loop, the shorthand
4101   // scalar IR looks like:
4102   //
4103   //   scalar.ph:
4104   //     s_init = a[-1]
4105   //     br scalar.body
4106   //
4107   //   scalar.body:
4108   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4109   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4110   //     s2 = a[i]
4111   //     b[i] = s2 - s1
4112   //     br cond, scalar.body, ...
4113   //
  // In this example, s1 is a recurrence because its value depends on the
4115   // previous iteration. In the first phase of vectorization, we created a
4116   // vector phi v1 for s1. We now complete the vectorization and produce the
4117   // shorthand vector IR shown below (for VF = 4, UF = 1).
4118   //
4119   //   vector.ph:
4120   //     v_init = vector(..., ..., ..., a[-1])
4121   //     br vector.body
4122   //
4123   //   vector.body
4124   //     i = phi [0, vector.ph], [i+4, vector.body]
4125   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
4126   //     v2 = a[i, i+1, i+2, i+3];
4127   //     v3 = vector(v1(3), v2(0, 1, 2))
4128   //     b[i, i+1, i+2, i+3] = v2 - v3
4129   //     br cond, vector.body, middle.block
4130   //
4131   //   middle.block:
4132   //     x = v2(3)
4133   //     br scalar.ph
4134   //
4135   //   scalar.ph:
4136   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4137   //     br scalar.body
4138   //
  // After the vector loop completes execution, we extract the next value of
4140   // the recurrence (x) to use as the initial value in the scalar loop.
4141 
4142   // Extract the last vector element in the middle block. This will be the
4143   // initial value for the recurrence when jumping to the scalar loop.
4144   VPValue *PreviousDef = PhiR->getBackedgeValue();
4145   Value *Incoming = State.get(PreviousDef, UF - 1);
4146   auto *ExtractForScalar = Incoming;
4147   auto *IdxTy = Builder.getInt32Ty();
4148   if (VF.isVector()) {
4149     auto *One = ConstantInt::get(IdxTy, 1);
4150     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4151     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4152     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4153     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
4154                                                     "vector.recur.extract");
4155   }
4156   // Extract the second last element in the middle block if the
4157   // Phi is used outside the loop. We need to extract the phi itself
4158   // and not the last element (the phi update in the current iteration). This
4159   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4160   // when the scalar loop is not run at all.
4161   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4162   if (VF.isVector()) {
4163     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4164     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4165     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4166         Incoming, Idx, "vector.recur.extract.for.phi");
4167   } else if (UF > 1)
4168     // When loop is unrolled without vectorizing, initialize
4169     // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
4170     // of `Incoming`. This is analogous to the vectorized case above: extracting
4171     // the second last element when VF > 1.
4172     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4173 
4174   // Fix the initial value of the original recurrence in the scalar loop.
4175   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4176   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4177   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4178   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4179   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4180     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4181     Start->addIncoming(Incoming, BB);
4182   }
4183 
4184   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4185   Phi->setName("scalar.recur");
4186 
4187   // Finally, fix users of the recurrence outside the loop. The users will need
4188   // either the last value of the scalar recurrence or the last value of the
4189   // vector recurrence we extracted in the middle block. Since the loop is in
4190   // LCSSA form, we just need to find all the phi nodes for the original scalar
4191   // recurrence in the exit block, and then add an edge for the middle block.
4192   // Note that LCSSA does not imply single entry when the original scalar loop
4193   // had multiple exiting edges (as we always run the last iteration in the
4194   // scalar epilogue); in that case, there is no edge from middle to exit and
  // thus no phis which need to be updated.
4196   if (!Cost->requiresScalarEpilogue(VF))
4197     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4198       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4199         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4200 }
4201 
4202 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4203                                        VPTransformState &State) {
4204   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
4206   assert(Legal->isReductionVariable(OrigPhi) &&
4207          "Unable to find the reduction variable");
4208   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4209 
4210   RecurKind RK = RdxDesc.getRecurrenceKind();
4211   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4212   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4213   setDebugLocFromInst(ReductionStartValue);
4214 
4215   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4216   // This is the vector-clone of the value that leaves the loop.
4217   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4218 
4219   // Wrap flags are in general invalid after vectorization, clear them.
4220   clearReductionWrapFlags(RdxDesc, State);
4221 
4222   // Before each round, move the insertion point right between
4223   // the PHIs and the values we are going to write.
4224   // This allows us to write both PHINodes and the extractelement
4225   // instructions.
4226   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4227 
4228   setDebugLocFromInst(LoopExitInst);
4229 
4230   Type *PhiTy = OrigPhi->getType();
4231   // If tail is folded by masking, the vector value to leave the loop should be
4232   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4233   // instead of the former. For an inloop reduction the reduction will already
4234   // be predicated, and does not need to be handled here.
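  // For illustration only (shorthand IR, VF = 4): with tail folding the loop
  // contains
  //   %rdx.next = add <4 x i32> %rdx.phi, %val
  //   %rdx.sel  = select <4 x i1> %m, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
  // and it is %rdx.sel, not %rdx.next, that must reach the middle block.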
4235   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4236     for (unsigned Part = 0; Part < UF; ++Part) {
4237       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4238       Value *Sel = nullptr;
4239       for (User *U : VecLoopExitInst->users()) {
4240         if (isa<SelectInst>(U)) {
4241           assert(!Sel && "Reduction exit feeding two selects");
4242           Sel = U;
4243         } else
          assert(isa<PHINode>(U) &&
                 "Reduction exit must feed PHIs or a select");
4245       }
4246       assert(Sel && "Reduction exit feeds no select");
4247       State.reset(LoopExitInstDef, Sel, Part);
4248 
4249       // If the target can create a predicated operator for the reduction at no
4250       // extra cost in the loop (for example a predicated vadd), it can be
4251       // cheaper for the select to remain in the loop than be sunk out of it,
4252       // and so use the select value for the phi instead of the old
4253       // LoopExitValue.
4254       if (PreferPredicatedReductionSelect ||
4255           TTI->preferPredicatedReductionSelect(
4256               RdxDesc.getOpcode(), PhiTy,
4257               TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi = cast<PHINode>(State.get(PhiR, Part));
4260         VecRdxPhi->setIncomingValueForBlock(
4261             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4262       }
4263     }
4264   }
4265 
4266   // If the vector reduction can be performed in a smaller type, we truncate
4267   // then extend the loop exit value to enable InstCombine to evaluate the
4268   // entire expression in the smaller type.
4269   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4270     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
4271     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4272     Builder.SetInsertPoint(
4273         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4274     VectorParts RdxParts(UF);
4275     for (unsigned Part = 0; Part < UF; ++Part) {
4276       RdxParts[Part] = State.get(LoopExitInstDef, Part);
4277       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4278       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4279                                         : Builder.CreateZExt(Trunc, VecTy);
4280       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
4281         if (U != Trunc) {
4282           U->replaceUsesOfWith(RdxParts[Part], Extnd);
4283           RdxParts[Part] = Extnd;
4284         }
4285     }
4286     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4287     for (unsigned Part = 0; Part < UF; ++Part) {
4288       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4289       State.reset(LoopExitInstDef, RdxParts[Part], Part);
4290     }
4291   }
4292 
4293   // Reduce all of the unrolled parts into a single vector.
4294   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4295   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4296 
4297   // The middle block terminator has already been assigned a DebugLoc here (the
4298   // OrigLoop's single latch terminator). We want the whole middle block to
4299   // appear to execute on this line because: (a) it is all compiler generated,
4300   // (b) these instructions are always executed after evaluating the latch
4301   // conditional branch, and (c) other passes may add new predecessors which
4302   // terminate on this line. This is the easiest way to ensure we don't
4303   // accidentally cause an extra step back into the loop while debugging.
4304   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
4305   if (PhiR->isOrdered())
4306     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4307   else {
4308     // Floating-point operations should have some FMF to enable the reduction.
4309     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4310     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4311     for (unsigned Part = 1; Part < UF; ++Part) {
4312       Value *RdxPart = State.get(LoopExitInstDef, Part);
4313       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4314         ReducedPartRdx = Builder.CreateBinOp(
4315             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4316       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
4317         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
4318                                            ReducedPartRdx, RdxPart);
4319       else
4320         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4321     }
4322   }
4323 
4324   // Create the reduction after the loop. Note that inloop reductions create the
4325   // target reduction in the loop using a Reduction recipe.
4326   if (VF.isVector() && !PhiR->isInLoop()) {
4327     ReducedPartRdx =
4328         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4329     // If the reduction can be performed in a smaller type, we need to extend
4330     // the reduction to the wider type before we branch to the original loop.
4331     if (PhiTy != RdxDesc.getRecurrenceType())
4332       ReducedPartRdx = RdxDesc.isSigned()
4333                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4334                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4335   }
4336 
4337   // Create a phi node that merges control-flow from the backedge-taken check
4338   // block and the middle block.
4339   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4340                                         LoopScalarPreHeader->getTerminator());
4341   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4342     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4343   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4344 
4345   // Now, we need to fix the users of the reduction variable
4346   // inside and outside of the scalar remainder loop.
4347 
4348   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4349   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4351   if (!Cost->requiresScalarEpilogue(VF))
4352     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4353       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4354         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4355 
4356   // Fix the scalar loop reduction variable with the incoming reduction sum
4357   // from the vector body and from the backedge value.
4358   int IncomingEdgeBlockIdx =
4359       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4360   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4361   // Pick the other block.
4362   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4363   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4364   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4365 }
4366 
4367 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4368                                                   VPTransformState &State) {
4369   RecurKind RK = RdxDesc.getRecurrenceKind();
4370   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4371     return;
4372 
4373   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4374   assert(LoopExitInstr && "null loop exit instruction");
4375   SmallVector<Instruction *, 8> Worklist;
4376   SmallPtrSet<Instruction *, 8> Visited;
4377   Worklist.push_back(LoopExitInstr);
4378   Visited.insert(LoopExitInstr);
4379 
4380   while (!Worklist.empty()) {
4381     Instruction *Cur = Worklist.pop_back_val();
4382     if (isa<OverflowingBinaryOperator>(Cur))
4383       for (unsigned Part = 0; Part < UF; ++Part) {
4384         // FIXME: Should not rely on getVPValue at this point.
4385         Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4386         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4387       }
4388 
4389     for (User *U : Cur->users()) {
4390       Instruction *UI = cast<Instruction>(U);
4391       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4392           Visited.insert(UI).second)
4393         Worklist.push_back(UI);
4394     }
4395   }
4396 }
4397 
4398 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4399   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4400     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4401       // Some phis were already hand updated by the reduction and recurrence
4402       // code above, leave them alone.
4403       continue;
4404 
4405     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4406     // Non-instruction incoming values will have only one value.
4407 
4408     VPLane Lane = VPLane::getFirstLane();
4409     if (isa<Instruction>(IncomingValue) &&
4410         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4411                                            VF))
4412       Lane = VPLane::getLastLaneForVF(VF);
4413 
4414     // Can be a loop invariant incoming value or the last scalar value to be
4415     // extracted from the vectorized loop.
4416     // FIXME: Should not rely on getVPValue at this point.
4417     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4418     Value *lastIncomingValue =
4419         OrigLoop->isLoopInvariant(IncomingValue)
4420             ? IncomingValue
4421             : State.get(State.Plan->getVPValue(IncomingValue, true),
4422                         VPIteration(UF - 1, Lane));
4423     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4424   }
4425 }
4426 
4427 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4428   // The basic block and loop containing the predicated instruction.
4429   auto *PredBB = PredInst->getParent();
4430   auto *VectorLoop = LI->getLoopFor(PredBB);
4431 
4432   // Initialize a worklist with the operands of the predicated instruction.
4433   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4434 
4435   // Holds instructions that we need to analyze again. An instruction may be
4436   // reanalyzed if we don't yet know if we can sink it or not.
4437   SmallVector<Instruction *, 8> InstsToReanalyze;
4438 
4439   // Returns true if a given use occurs in the predicated block. Phi nodes use
4440   // their operands in their corresponding predecessor blocks.
4441   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4442     auto *I = cast<Instruction>(U.getUser());
4443     BasicBlock *BB = I->getParent();
4444     if (auto *Phi = dyn_cast<PHINode>(I))
4445       BB = Phi->getIncomingBlock(
4446           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4447     return BB == PredBB;
4448   };
4449 
4450   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
4452   // operands are then added to the worklist. The algorithm ends after one pass
4453   // through the worklist doesn't sink a single instruction.
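  // For illustration only: if the predicated block contains
  //   %d = udiv i32 %a, %b
  // and %a is defined by an extractelement whose only use is %d, that
  // extract is sunk into the predicated block too, and its own operands are
  // then reconsidered on the next pass.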
4454   bool Changed;
4455   do {
4456     // Add the instructions that need to be reanalyzed to the worklist, and
4457     // reset the changed indicator.
4458     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4459     InstsToReanalyze.clear();
4460     Changed = false;
4461 
4462     while (!Worklist.empty()) {
4463       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4464 
4465       // We can't sink an instruction if it is a phi node, is not in the loop,
4466       // or may have side effects.
4467       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4468           I->mayHaveSideEffects())
4469         continue;
4470 
4471       // If the instruction is already in PredBB, check if we can sink its
4472       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4473       // sinking the scalar instruction I, hence it appears in PredBB; but it
4474       // may have failed to sink I's operands (recursively), which we try
4475       // (again) here.
4476       if (I->getParent() == PredBB) {
4477         Worklist.insert(I->op_begin(), I->op_end());
4478         continue;
4479       }
4480 
4481       // It's legal to sink the instruction if all its uses occur in the
4482       // predicated block. Otherwise, there's nothing to do yet, and we may
4483       // need to reanalyze the instruction.
4484       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4485         InstsToReanalyze.push_back(I);
4486         continue;
4487       }
4488 
4489       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4491       I->moveBefore(&*PredBB->getFirstInsertionPt());
4492       Worklist.insert(I->op_begin(), I->op_end());
4493 
4494       // The sinking may have enabled other instructions to be sunk, so we will
4495       // need to iterate.
4496       Changed = true;
4497     }
4498   } while (Changed);
4499 }
4500 
4501 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4502   for (PHINode *OrigPhi : OrigPHIsToFix) {
4503     VPWidenPHIRecipe *VPPhi =
4504         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4505     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4506     // Make sure the builder has a valid insert point.
4507     Builder.SetInsertPoint(NewPhi);
4508     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4509       VPValue *Inc = VPPhi->getIncomingValue(i);
4510       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4511       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4512     }
4513   }
4514 }
4515 
4516 bool InnerLoopVectorizer::useOrderedReductions(
4517     const RecurrenceDescriptor &RdxDesc) {
4518   return Cost->useOrderedReductions(RdxDesc);
4519 }
4520 
4521 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4522                                               VPWidenPHIRecipe *PhiR,
4523                                               VPTransformState &State) {
4524   PHINode *P = cast<PHINode>(PN);
4525   if (EnableVPlanNativePath) {
4526     // Currently we enter here in the VPlan-native path for non-induction
4527     // PHIs where all control flow is uniform. We simply widen these PHIs.
4528     // Create a vector phi with no operands - the vector phi operands will be
4529     // set at the end of vector code generation.
4530     Type *VecTy = (State.VF.isScalar())
4531                       ? PN->getType()
4532                       : VectorType::get(PN->getType(), State.VF);
4533     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4534     State.set(PhiR, VecPhi, 0);
4535     OrigPHIsToFix.push_back(P);
4536 
4537     return;
4538   }
4539 
4540   assert(PN->getParent() == OrigLoop->getHeader() &&
4541          "Non-header phis should have been handled elsewhere");
4542 
4543   // In order to support recurrences we need to be able to vectorize Phi nodes.
4544   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4545   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4546   // this value when we vectorize all of the instructions that use the PHI.
4547 
4548   assert(!Legal->isReductionVariable(P) &&
4549          "reductions should be handled elsewhere");
4550 
4551   setDebugLocFromInst(P);
4552 
4553   // This PHINode must be an induction variable.
4554   // Make sure that we know about it.
4555   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4556 
4557   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4558   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4559 
4560   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4561   // which can be found from the original scalar operations.
4562   switch (II.getKind()) {
4563   case InductionDescriptor::IK_NoInduction:
4564     llvm_unreachable("Unknown induction");
4565   case InductionDescriptor::IK_IntInduction:
4566   case InductionDescriptor::IK_FpInduction:
4567     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4568   case InductionDescriptor::IK_PtrInduction: {
4569     // Handle the pointer induction variable case.
4570     assert(P->getType()->isPointerTy() && "Unexpected type.");
4571 
4572     if (Cost->isScalarAfterVectorization(P, State.VF)) {
4573       // This is the normalized GEP that starts counting at zero.
4574       Value *PtrInd =
4575           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4576       // Determine the number of scalars we need to generate for each unroll
4577       // iteration. If the instruction is uniform, we only need to generate the
4578       // first lane. Otherwise, we generate all VF values.
4579       bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
4580       assert((IsUniform || !State.VF.isScalable()) &&
4581              "Cannot scalarize a scalable VF");
4582       unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
4583 
4584       for (unsigned Part = 0; Part < UF; ++Part) {
4585         Value *PartStart =
4586             createStepForVF(Builder, PtrInd->getType(), VF, Part);
4587 
4588         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4589           Value *Idx = Builder.CreateAdd(
4590               PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4591           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4592           Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(),
4593                                                 DL, II, State.CFG.PrevBB);
4594           SclrGep->setName("next.gep");
4595           State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4596         }
4597       }
4598       return;
4599     }
4600     assert(isa<SCEVConstant>(II.getStep()) &&
4601            "Induction step not a SCEV constant!");
4602     Type *PhiType = II.getStep()->getType();
4603 
4604     // Build a pointer phi
4605     Value *ScalarStartValue = II.getStartValue();
4606     Type *ScStValueType = ScalarStartValue->getType();
4607     PHINode *NewPointerPhi =
4608         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4609     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4610 
4611     // A pointer induction, performed by using a gep
4612     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4613     Instruction *InductionLoc = LoopLatch->getTerminator();
4614     const SCEV *ScalarStep = II.getStep();
4615     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4616     Value *ScalarStepValue =
4617         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4618     Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4619     Value *NumUnrolledElems =
4620         Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4621     Value *InductionGEP = GetElementPtrInst::Create(
4622         II.getElementType(), NewPointerPhi,
4623         Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4624         InductionLoc);
4625     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4626 
4627     // Create UF many actual address geps that use the pointer
4628     // phi as base and a vectorized version of the step value
4629     // (<step*0, ..., step*N>) as offset.
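    // For illustration only (VF = 4, UF = 2, shorthand IR):
    //   %off0 = mul <4 x i64> <i64 0, i64 1, i64 2, i64 3>, %step.splat
    //   %gep0 = getelementptr %elemty, %pointer.phi, <4 x i64> %off0
    //   %off1 = mul <4 x i64> <i64 4, i64 5, i64 6, i64 7>, %step.splat
    //   %gep1 = getelementptr %elemty, %pointer.phi, <4 x i64> %off1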
4630     for (unsigned Part = 0; Part < State.UF; ++Part) {
4631       Type *VecPhiType = VectorType::get(PhiType, State.VF);
4632       Value *StartOffsetScalar =
4633           Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4634       Value *StartOffset =
4635           Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
4636       // Create a vector of consecutive numbers from zero to VF.
4637       StartOffset =
4638           Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4639 
4640       Value *GEP = Builder.CreateGEP(
4641           II.getElementType(), NewPointerPhi,
4642           Builder.CreateMul(
4643               StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4644               "vector.gep"));
4645       State.set(PhiR, GEP, Part);
4646     }
4647   }
4648   }
4649 }
4650 
4651 /// A helper function for checking whether an integer division-related
4652 /// instruction may divide by zero (in which case it must be predicated if
4653 /// executed conditionally in the scalar code).
4654 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4655 /// Non-zero divisors that are non compile-time constants will not be
4656 /// converted into multiplication, so we will still end up scalarizing
4657 /// the division, but can do so w/o predication.
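/// For example (illustrative): in
///   if (c) x = a / b;
/// a constant non-zero divisor (say, 4) lets the scalarized division execute
/// unconditionally, whereas an unknown divisor must remain predicated since
/// it may be zero whenever the guarding condition is false.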
4658 static bool mayDivideByZero(Instruction &I) {
4659   assert((I.getOpcode() == Instruction::UDiv ||
4660           I.getOpcode() == Instruction::SDiv ||
4661           I.getOpcode() == Instruction::URem ||
4662           I.getOpcode() == Instruction::SRem) &&
4663          "Unexpected instruction");
4664   Value *Divisor = I.getOperand(1);
4665   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4666   return !CInt || CInt->isZero();
4667 }
4668 
4669 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4670                                                VPUser &ArgOperands,
4671                                                VPTransformState &State) {
4672   assert(!isa<DbgInfoIntrinsic>(I) &&
4673          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4674   setDebugLocFromInst(&I);
4675 
4676   Module *M = I.getParent()->getParent()->getParent();
4677   auto *CI = cast<CallInst>(&I);
4678 
4679   SmallVector<Type *, 4> Tys;
4680   for (Value *ArgOperand : CI->args())
4681     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4682 
4683   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4684 
  // The flag shows whether we use an intrinsic or a regular function call
  // for the vectorized version of the instruction, i.e. whether it is more
  // beneficial to perform the intrinsic call than the library call.
4688   bool NeedToScalarize = false;
4689   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4690   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4691   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4692   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4693          "Instruction should be scalarized elsewhere.");
4694   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4695          "Either the intrinsic cost or vector call cost must be valid");
4696 
4697   for (unsigned Part = 0; Part < UF; ++Part) {
4698     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4699     SmallVector<Value *, 4> Args;
4700     for (auto &I : enumerate(ArgOperands.operands())) {
4701       // Some intrinsics have a scalar argument - don't replace it with a
4702       // vector.
4703       Value *Arg;
4704       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4705         Arg = State.get(I.value(), Part);
4706       else {
4707         Arg = State.get(I.value(), VPIteration(0, 0));
4708         if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
4709           TysForDecl.push_back(Arg->getType());
4710       }
4711       Args.push_back(Arg);
4712     }
4713 
4714     Function *VectorF;
4715     if (UseVectorIntrinsic) {
4716       // Use vector version of the intrinsic.
4717       if (VF.isVector())
4718         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4719       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4720       assert(VectorF && "Can't retrieve vector intrinsic.");
4721     } else {
4722       // Use vector version of the function call.
4723       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4724 #ifndef NDEBUG
4725       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4726              "Can't create vector function.");
4727 #endif
4728       VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4729     }
4730     SmallVector<OperandBundleDef, 1> OpBundles;
4731     CI->getOperandBundlesAsDefs(OpBundles);
4732     CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4733 
4734     if (isa<FPMathOperator>(V))
4735       V->copyFastMathFlags(CI);
4736 
4737     State.set(Def, V, Part);
4738     addMetadata(V, &I);
4739   }
4740 }
4741 
4742 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4743   // We should not collect Scalars more than once per VF. Right now, this
4744   // function is called from collectUniformsAndScalars(), which already does
4745   // this check. Collecting Scalars for VF=1 does not make any sense.
4746   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4747          "This function should not be visited twice for the same VF");
4748 
4749   SmallSetVector<Instruction *, 8> Worklist;
4750 
4751   // These sets are used to seed the analysis with pointers used by memory
4752   // accesses that will remain scalar.
4753   SmallSetVector<Instruction *, 8> ScalarPtrs;
4754   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4755   auto *Latch = TheLoop->getLoopLatch();
4756 
4757   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4758   // The pointer operands of loads and stores will be scalar as long as the
4759   // memory access is not a gather or scatter operation. The value operand of a
4760   // store will remain scalar if the store is scalarized.
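  // For example, for a consecutive (widened) load
  //   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
  //   %val = load i32, i32* %gep
  // only the lane-0 address is needed to form the wide load, so the use of
  // %gep by the load is a scalar use; only a gather/scatter needs a full
  // vector of pointers. (Illustrative IR, names are made up.)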
4761   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4762     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4763     assert(WideningDecision != CM_Unknown &&
4764            "Widening decision should be ready at this moment");
4765     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4766       if (Ptr == Store->getValueOperand())
4767         return WideningDecision == CM_Scalarize;
4768     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4769            "Ptr is neither a value or pointer operand");
4770     return WideningDecision != CM_GatherScatter;
4771   };
4772 
4773   // A helper that returns true if the given value is a bitcast or
4774   // getelementptr instruction contained in the loop.
4775   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4776     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4777             isa<GetElementPtrInst>(V)) &&
4778            !TheLoop->isLoopInvariant(V);
4779   };
4780 
4781   // A helper that evaluates a memory access's use of a pointer. If the use will
4782   // be a scalar use and the pointer is only used by memory accesses, we place
4783   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4784   // PossibleNonScalarPtrs.
4785   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4786     // We only care about bitcast and getelementptr instructions contained in
4787     // the loop.
4788     if (!isLoopVaryingBitCastOrGEP(Ptr))
4789       return;
4790 
4791     // If the pointer has already been identified as scalar (e.g., if it was
4792     // also identified as uniform), there's nothing to do.
4793     auto *I = cast<Instruction>(Ptr);
4794     if (Worklist.count(I))
4795       return;
4796 
4797     // If the use of the pointer will be a scalar use, and all users of the
4798     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4799     // place the pointer in PossibleNonScalarPtrs.
4800     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4801           return isa<LoadInst>(U) || isa<StoreInst>(U);
4802         }))
4803       ScalarPtrs.insert(I);
4804     else
4805       PossibleNonScalarPtrs.insert(I);
4806   };
4807 
4808   // We seed the scalars analysis with two classes of instructions: (1)
4809   // instructions marked uniform-after-vectorization and (2) bitcast,
4810   // getelementptr and (pointer) phi instructions used by memory accesses
4811   // requiring a scalar use.
4812   //
4813   // (1) Add to the worklist all instructions that have been identified as
4814   // uniform-after-vectorization.
4815   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4816 
4817   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4818   // memory accesses requiring a scalar use. The pointer operands of loads and
4819   // stores will be scalar as long as the memory access is not a gather or
4820   // scatter operation. The value operand of a store will remain scalar if the
4821   // store is scalarized.
4822   for (auto *BB : TheLoop->blocks())
4823     for (auto &I : *BB) {
4824       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4825         evaluatePtrUse(Load, Load->getPointerOperand());
4826       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4827         evaluatePtrUse(Store, Store->getPointerOperand());
4828         evaluatePtrUse(Store, Store->getValueOperand());
4829       }
4830     }
4831   for (auto *I : ScalarPtrs)
4832     if (!PossibleNonScalarPtrs.count(I)) {
4833       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4834       Worklist.insert(I);
4835     }
4836 
4837   // Insert the forced scalars.
4838   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4839   // induction variable when the PHI user is scalarized.
4840   auto ForcedScalar = ForcedScalars.find(VF);
4841   if (ForcedScalar != ForcedScalars.end())
4842     for (auto *I : ForcedScalar->second)
4843       Worklist.insert(I);
4844 
4845   // Expand the worklist by looking through any bitcasts and getelementptr
4846   // instructions we've already identified as scalar. This is similar to the
4847   // expansion step in collectLoopUniforms(); however, here we're only
4848   // expanding to include additional bitcasts and getelementptr instructions.
4849   unsigned Idx = 0;
4850   while (Idx != Worklist.size()) {
4851     Instruction *Dst = Worklist[Idx++];
4852     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4853       continue;
4854     auto *Src = cast<Instruction>(Dst->getOperand(0));
4855     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4856           auto *J = cast<Instruction>(U);
4857           return !TheLoop->contains(J) || Worklist.count(J) ||
4858                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4859                   isScalarUse(J, Src));
4860         })) {
4861       Worklist.insert(Src);
4862       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4863     }
4864   }
4865 
4866   // An induction variable will remain scalar if all users of the induction
4867   // variable and induction variable update remain scalar.
4868   for (auto &Induction : Legal->getInductionVars()) {
4869     auto *Ind = Induction.first;
4870     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4871 
4872     // If tail-folding is applied, the primary induction variable will be used
4873     // to feed a vector compare.
4874     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4875       continue;
4876 
4877     // Returns true if \p Indvar is a pointer induction that is used directly by
4878     // load/store instruction \p I.
4879     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4880                                               Instruction *I) {
4881       return Induction.second.getKind() ==
4882                  InductionDescriptor::IK_PtrInduction &&
4883              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4884              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4885     };
4886 
4887     // Determine if all users of the induction variable are scalar after
4888     // vectorization.
4889     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4890       auto *I = cast<Instruction>(U);
4891       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4892              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4893     });
4894     if (!ScalarInd)
4895       continue;
4896 
4897     // Determine if all users of the induction variable update instruction are
4898     // scalar after vectorization.
4899     auto ScalarIndUpdate =
4900         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4901           auto *I = cast<Instruction>(U);
4902           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4903                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4904         });
4905     if (!ScalarIndUpdate)
4906       continue;
4907 
4908     // The induction variable and its update instruction will remain scalar.
4909     Worklist.insert(Ind);
4910     Worklist.insert(IndUpdate);
4911     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4912     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4913                       << "\n");
4914   }
4915 
4916   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4917 }
4918 
4919 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
4920   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4921     return false;
4922   switch(I->getOpcode()) {
4923   default:
4924     break;
4925   case Instruction::Load:
4926   case Instruction::Store: {
4927     if (!Legal->isMaskRequired(I))
4928       return false;
4929     auto *Ptr = getLoadStorePointerOperand(I);
4930     auto *Ty = getLoadStoreType(I);
4931     const Align Alignment = getLoadStoreAlignment(I);
4932     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4933                                 TTI.isLegalMaskedGather(Ty, Alignment))
4934                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4935                                 TTI.isLegalMaskedScatter(Ty, Alignment));
4936   }
4937   case Instruction::UDiv:
4938   case Instruction::SDiv:
4939   case Instruction::SRem:
4940   case Instruction::URem:
4941     return mayDivideByZero(*I);
4942   }
4943   return false;
4944 }
4945 
4946 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4947     Instruction *I, ElementCount VF) {
4948   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4949   assert(getWideningDecision(I, VF) == CM_Unknown &&
4950          "Decision should not be set yet.");
4951   auto *Group = getInterleavedAccessGroup(I);
4952   assert(Group && "Must have a group.");
4953 
4954   // If the instruction's allocated size doesn't equal its type size, it
4955   // requires padding and will be scalarized.
4956   auto &DL = I->getModule()->getDataLayout();
4957   auto *ScalarTy = getLoadStoreType(I);
4958   if (hasIrregularType(ScalarTy, DL))
4959     return false;
4960 
4961   // Check if masking is required.
4962   // A Group may need masking for one of two reasons: it resides in a block that
4963   // needs predication, or it was decided to use masking to deal with gaps
4964   // (either a gap at the end of a load-access that may result in a speculative
4965   // load, or any gaps in a store-access).
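  // For example, a factor-3 load group where the last member is missing:
  //   for (i = 0; i < N; i++) {
  //     sum += A[3*i];      // member 0
  //     sum += A[3*i + 1];  // member 1 (member 2 is a gap)
  //   }
  // The wide load covering the whole group also reads the gap and may run
  // past the last element actually accessed, so the final accesses must
  // either be done in a scalar epilogue or be masked. (Illustrative example.)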
4966   bool PredicatedAccessRequiresMasking =
4967       blockNeedsPredicationForAnyReason(I->getParent()) &&
4968       Legal->isMaskRequired(I);
4969   bool LoadAccessWithGapsRequiresEpilogMasking =
4970       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4971       !isScalarEpilogueAllowed();
4972   bool StoreAccessWithGapsRequiresMasking =
4973       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4974   if (!PredicatedAccessRequiresMasking &&
4975       !LoadAccessWithGapsRequiresEpilogMasking &&
4976       !StoreAccessWithGapsRequiresMasking)
4977     return true;
4978 
4979   // If masked interleaving is required, we expect that the user/target had
4980   // enabled it, because otherwise it either wouldn't have been created or
4981   // it should have been invalidated by the CostModel.
4982   assert(useMaskedInterleavedAccesses(TTI) &&
4983          "Masked interleave-groups for predicated accesses are not enabled.");
4984 
4985   if (Group->isReverse())
4986     return false;
4987 
4988   auto *Ty = getLoadStoreType(I);
4989   const Align Alignment = getLoadStoreAlignment(I);
4990   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4991                           : TTI.isLegalMaskedStore(Ty, Alignment);
4992 }
4993 
4994 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4995     Instruction *I, ElementCount VF) {
4996   // Get and ensure we have a valid memory instruction.
4997   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4998 
4999   auto *Ptr = getLoadStorePointerOperand(I);
5000   auto *ScalarTy = getLoadStoreType(I);
5001 
5002   // In order to be widened, the pointer should be consecutive, first of all.
5003   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
5004     return false;
5005 
5006   // If the instruction is a store located in a predicated block, it will be
5007   // scalarized.
5008   if (isScalarWithPredication(I))
5009     return false;
5010 
5011   // If the instruction's allocated size doesn't equal its type size, it
5012   // requires padding and will be scalarized.
5013   auto &DL = I->getModule()->getDataLayout();
5014   if (hasIrregularType(ScalarTy, DL))
5015     return false;
5016 
5017   return true;
5018 }
5019 
5020 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5021   // We should not collect Uniforms more than once per VF. Right now,
5022   // this function is called from collectUniformsAndScalars(), which
5023   // already does this check. Collecting Uniforms for VF=1 does not make any
5024   // sense.
5025 
5026   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5027          "This function should not be visited twice for the same VF");
5028 
5029   // Visit the list of Uniforms. If we do not find any uniform value, we will
5030   // not analyze the loop again; Uniforms.count(VF) will still return 1.
5031   Uniforms[VF].clear();
5032 
5033   // We now know that the loop is vectorizable!
5034   // Collect instructions inside the loop that will remain uniform after
5035   // vectorization.
5036 
5037   // Global values, params and instructions outside of current loop are out of
5038   // scope.
5039   auto isOutOfScope = [&](Value *V) -> bool {
5040     Instruction *I = dyn_cast<Instruction>(V);
5041     return (!I || !TheLoop->contains(I));
5042   };
5043 
5044   // Worklist containing uniform instructions demanding lane 0.
5045   SetVector<Instruction *> Worklist;
5046   BasicBlock *Latch = TheLoop->getLoopLatch();
5047 
5048   // Add uniform instructions demanding lane 0 to the worklist. Instructions
5049   // that are scalar with predication must not be considered uniform after
5050   // vectorization, because that would create an erroneous replicating region
5051   // where only a single instance out of VF should be formed.
5052   // TODO: optimize such rare cases, if found important; see PR40816.
5053   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5054     if (isOutOfScope(I)) {
5055       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5056                         << *I << "\n");
5057       return;
5058     }
5059     if (isScalarWithPredication(I)) {
5060       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5061                         << *I << "\n");
5062       return;
5063     }
5064     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5065     Worklist.insert(I);
5066   };
5067 
5068   // Start with the conditional branch. If the branch condition is an
5069   // instruction contained in the loop that is only used by the branch, it is
5070   // uniform.
5071   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5072   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5073     addToWorklistIfAllowed(Cmp);
5074 
5075   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5076     InstWidening WideningDecision = getWideningDecision(I, VF);
5077     assert(WideningDecision != CM_Unknown &&
5078            "Widening decision should be ready at this moment");
5079 
5080     // A uniform memory op is itself uniform.  We exclude uniform stores
5081     // here as they demand the last lane, not the first one.
5082     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5083       assert(WideningDecision == CM_Scalarize);
5084       return true;
5085     }
5086 
5087     return (WideningDecision == CM_Widen ||
5088             WideningDecision == CM_Widen_Reverse ||
5089             WideningDecision == CM_Interleave);
5090   };
5091 
5092 
5093   // Returns true if Ptr is the pointer operand of a memory access instruction
5094   // I, and I is known to not require scalarization.
5095   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5096     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5097   };
5098 
5099   // Holds a list of values which are known to have at least one uniform use.
5100   // Note that there may be other uses which aren't uniform.  A "uniform use"
5101   // here is something which only demands lane 0 of the unrolled iterations;
5102   // it does not imply that all lanes produce the same value (i.e. this is not
5103   // the usual meaning of uniform).
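  // For example, a getelementptr whose only user is the pointer operand of a
  // consecutive (widened) load has a uniform use: the wide load is formed
  // from the lane-0 address alone, even though the GEP notionally yields a
  // distinct address per lane. (Illustrative example.)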
5104   SetVector<Value *> HasUniformUse;
5105 
5106   // Scan the loop for instructions which are either a) known to have only
5107   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5108   for (auto *BB : TheLoop->blocks())
5109     for (auto &I : *BB) {
5110       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
5111         switch (II->getIntrinsicID()) {
5112         case Intrinsic::sideeffect:
5113         case Intrinsic::experimental_noalias_scope_decl:
5114         case Intrinsic::assume:
5115         case Intrinsic::lifetime_start:
5116         case Intrinsic::lifetime_end:
5117           if (TheLoop->hasLoopInvariantOperands(&I))
5118             addToWorklistIfAllowed(&I);
5119           break;
5120         default:
5121           break;
5122         }
5123       }
5124 
5125       // ExtractValue instructions must be uniform, because the operands are
5126       // known to be loop-invariant.
5127       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
5128         assert(isOutOfScope(EVI->getAggregateOperand()) &&
5129                "Expected aggregate value to be loop invariant");
5130         addToWorklistIfAllowed(EVI);
5131         continue;
5132       }
5133 
5134       // If there's no pointer operand, there's nothing to do.
5135       auto *Ptr = getLoadStorePointerOperand(&I);
5136       if (!Ptr)
5137         continue;
5138 
5139       // A uniform memory op is itself uniform.  We exclude uniform stores
5140       // here as they demand the last lane, not the first one.
5141       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5142         addToWorklistIfAllowed(&I);
5143 
5144       if (isUniformDecision(&I, VF)) {
5145         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5146         HasUniformUse.insert(Ptr);
5147       }
5148     }
5149 
5150   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5151   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5152   // disallows uses outside the loop as well.
5153   for (auto *V : HasUniformUse) {
5154     if (isOutOfScope(V))
5155       continue;
5156     auto *I = cast<Instruction>(V);
5157     auto UsersAreMemAccesses =
5158       llvm::all_of(I->users(), [&](User *U) -> bool {
5159         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5160       });
5161     if (UsersAreMemAccesses)
5162       addToWorklistIfAllowed(I);
5163   }
5164 
5165   // Expand Worklist in topological order: whenever a new instruction
5166   // is added, its users should already be inside Worklist. This ensures that
5167   // a uniform instruction will only be used by uniform instructions.
5168   unsigned idx = 0;
5169   while (idx != Worklist.size()) {
5170     Instruction *I = Worklist[idx++];
5171 
5172     for (auto OV : I->operand_values()) {
5173       // isOutOfScope operands cannot be uniform instructions.
5174       if (isOutOfScope(OV))
5175         continue;
5176       // First-order recurrence phis should typically be considered
5177       // non-uniform.
5178       auto *OP = dyn_cast<PHINode>(OV);
5179       if (OP && Legal->isFirstOrderRecurrence(OP))
5180         continue;
5181       // If all the users of the operand are uniform, then add the
5182       // operand into the uniform worklist.
5183       auto *OI = cast<Instruction>(OV);
5184       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5185             auto *J = cast<Instruction>(U);
5186             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5187           }))
5188         addToWorklistIfAllowed(OI);
5189     }
5190   }
5191 
5192   // For an instruction to be added into Worklist above, all its users inside
5193   // the loop should also be in Worklist. However, this condition cannot be
5194   // true for phi nodes that form a cyclic dependence. We must process phi
5195   // nodes separately. An induction variable will remain uniform if all users
5196   // of the induction variable and induction variable update remain uniform.
5197   // The code below handles both pointer and non-pointer induction variables.
5198   for (auto &Induction : Legal->getInductionVars()) {
5199     auto *Ind = Induction.first;
5200     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5201 
5202     // Determine if all users of the induction variable are uniform after
5203     // vectorization.
5204     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5205       auto *I = cast<Instruction>(U);
5206       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5207              isVectorizedMemAccessUse(I, Ind);
5208     });
5209     if (!UniformInd)
5210       continue;
5211 
5212     // Determine if all users of the induction variable update instruction are
5213     // uniform after vectorization.
5214     auto UniformIndUpdate =
5215         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5216           auto *I = cast<Instruction>(U);
5217           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5218                  isVectorizedMemAccessUse(I, IndUpdate);
5219         });
5220     if (!UniformIndUpdate)
5221       continue;
5222 
5223     // The induction variable and its update instruction will remain uniform.
5224     addToWorklistIfAllowed(Ind);
5225     addToWorklistIfAllowed(IndUpdate);
5226   }
5227 
5228   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5229 }
5230 
5231 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5232   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5233 
5234   if (Legal->getRuntimePointerChecking()->Need) {
5235     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5236         "runtime pointer checks needed. Enable vectorization of this "
5237         "loop with '#pragma clang loop vectorize(enable)' when "
5238         "compiling with -Os/-Oz",
5239         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5240     return true;
5241   }
5242 
5243   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5244     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5245         "runtime SCEV checks needed. Enable vectorization of this "
5246         "loop with '#pragma clang loop vectorize(enable)' when "
5247         "compiling with -Os/-Oz",
5248         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5249     return true;
5250   }
5251 
5252   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5253   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5254     reportVectorizationFailure("Runtime stride check for small trip count",
5255         "runtime stride == 1 checks needed. Enable vectorization of "
5256         "this loop without such check by compiling with -Os/-Oz",
5257         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5258     return true;
5259   }
5260 
5261   return false;
5262 }
5263 
5264 ElementCount
5265 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
5266   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
5267     return ElementCount::getScalable(0);
5268 
5269   if (Hints->isScalableVectorizationDisabled()) {
5270     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
5271                             "ScalableVectorizationDisabled", ORE, TheLoop);
5272     return ElementCount::getScalable(0);
5273   }
5274 
5275   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
5276 
5277   auto MaxScalableVF = ElementCount::getScalable(
5278       std::numeric_limits<ElementCount::ScalarTy>::max());
5279 
5280   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
5281   // FIXME: While for scalable vectors this is currently sufficient, this should
5282   // be replaced by a more detailed mechanism that filters out specific VFs,
5283   // instead of invalidating vectorization for a whole set of VFs based on the
5284   // MaxVF.
5285 
5286   // Disable scalable vectorization if the loop contains unsupported reductions.
5287   if (!canVectorizeReductions(MaxScalableVF)) {
5288     reportVectorizationInfo(
5289         "Scalable vectorization not supported for the reduction "
5290         "operations found in this loop.",
5291         "ScalableVFUnfeasible", ORE, TheLoop);
5292     return ElementCount::getScalable(0);
5293   }
5294 
5295   // Disable scalable vectorization if the loop contains any instructions
5296   // with element types not supported for scalable vectors.
5297   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
5298         return !Ty->isVoidTy() &&
5299                !this->TTI.isElementTypeLegalForScalableVector(Ty);
5300       })) {
5301     reportVectorizationInfo("Scalable vectorization is not supported "
5302                             "for all element types found in this loop.",
5303                             "ScalableVFUnfeasible", ORE, TheLoop);
5304     return ElementCount::getScalable(0);
5305   }
5306 
5307   if (Legal->isSafeForAnyVectorWidth())
5308     return MaxScalableVF;
5309 
5310   // Limit MaxScalableVF by the maximum safe dependence distance.
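  // For example, with MaxSafeElements = 32 and a maximum vscale of 16, the
  // largest legal scalable VF is vscale x 2, since 2 * 16 <= 32.
  // (Illustrative numbers.)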
5311   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5312   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
5313     MaxVScale =
5314         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
5315   MaxScalableVF = ElementCount::getScalable(
5316       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5317   if (!MaxScalableVF)
5318     reportVectorizationInfo(
5319         "Max legal vector width too small, scalable vectorization "
5320         "unfeasible.",
5321         "ScalableVFUnfeasible", ORE, TheLoop);
5322 
5323   return MaxScalableVF;
5324 }
5325 
5326 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
5327     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
5328   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5329   unsigned SmallestType, WidestType;
5330   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5331 
5332   // Get the maximum safe dependence distance in bits computed by LAA.
5333   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5334   // the memory access that is most restrictive (involved in the smallest
5335   // dependence distance).
5336   unsigned MaxSafeElements =
5337       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
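  // For example, a maximum safe vector width of 256 bits with a widest loop
  // type of i32 gives MaxSafeElements = PowerOf2Floor(256 / 32) = 8.
  // (Illustrative numbers.)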
5338 
5339   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
5340   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
5341 
5342   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5343                     << ".\n");
5344   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5345                     << ".\n");
5346 
5347   // First analyze the UserVF, fall back if the UserVF should be ignored.
5348   if (UserVF) {
5349     auto MaxSafeUserVF =
5350         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
5351 
5352     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
5353       // If `VF=vscale x N` is safe, then so is `VF=N`
5354       if (UserVF.isScalable())
5355         return FixedScalableVFPair(
5356             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
5357       else
5358         return UserVF;
5359     }
5360 
5361     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5362 
5363     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
5364     // is better to ignore the hint and let the compiler choose a suitable VF.
5365     if (!UserVF.isScalable()) {
5366       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5367                         << " is unsafe, clamping to max safe VF="
5368                         << MaxSafeFixedVF << ".\n");
5369       ORE->emit([&]() {
5370         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5371                                           TheLoop->getStartLoc(),
5372                                           TheLoop->getHeader())
5373                << "User-specified vectorization factor "
5374                << ore::NV("UserVectorizationFactor", UserVF)
5375                << " is unsafe, clamping to maximum safe vectorization factor "
5376                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
5377       });
5378       return MaxSafeFixedVF;
5379     }
5380 
5381     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
5382       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5383                         << " is ignored because scalable vectors are not "
5384                            "available.\n");
5385       ORE->emit([&]() {
5386         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5387                                           TheLoop->getStartLoc(),
5388                                           TheLoop->getHeader())
5389                << "User-specified vectorization factor "
5390                << ore::NV("UserVectorizationFactor", UserVF)
5391                << " is ignored because the target does not support scalable "
5392                   "vectors. The compiler will pick a more suitable value.";
5393       });
5394     } else {
5395       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5396                         << " is unsafe. Ignoring scalable UserVF.\n");
5397       ORE->emit([&]() {
5398         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5399                                           TheLoop->getStartLoc(),
5400                                           TheLoop->getHeader())
5401                << "User-specified vectorization factor "
5402                << ore::NV("UserVectorizationFactor", UserVF)
5403                << " is unsafe. Ignoring the hint to let the compiler pick a "
5404                   "more suitable value.";
5405       });
5406     }
5407   }
5408 
5409   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5410                     << " / " << WidestType << " bits.\n");
5411 
5412   FixedScalableVFPair Result(ElementCount::getFixed(1),
5413                              ElementCount::getScalable(0));
5414   if (auto MaxVF =
5415           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5416                                   MaxSafeFixedVF, FoldTailByMasking))
5417     Result.FixedVF = MaxVF;
5418 
5419   if (auto MaxVF =
5420           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5421                                   MaxSafeScalableVF, FoldTailByMasking))
5422     if (MaxVF.isScalable()) {
5423       Result.ScalableVF = MaxVF;
5424       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5425                         << "\n");
5426     }
5427 
5428   return Result;
5429 }
5430 
5431 FixedScalableVFPair
5432 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5433   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5434     // TODO: It may be useful to do this, since the check is still likely to be
5435     // dynamically uniform if the target can skip it.
5436     reportVectorizationFailure(
5437         "Not inserting runtime ptr check for divergent target",
5438         "runtime pointer checks needed. Not enabled for divergent target",
5439         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5440     return FixedScalableVFPair::getNone();
5441   }
5442 
5443   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5444   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5445   if (TC == 1) {
5446     reportVectorizationFailure("Single iteration (non) loop",
5447         "loop trip count is one, irrelevant for vectorization",
5448         "SingleIterationLoop", ORE, TheLoop);
5449     return FixedScalableVFPair::getNone();
5450   }
5451 
5452   switch (ScalarEpilogueStatus) {
5453   case CM_ScalarEpilogueAllowed:
5454     return computeFeasibleMaxVF(TC, UserVF, false);
5455   case CM_ScalarEpilogueNotAllowedUsePredicate:
5456     LLVM_FALLTHROUGH;
5457   case CM_ScalarEpilogueNotNeededUsePredicate:
5458     LLVM_DEBUG(
5459         dbgs() << "LV: vector predicate hint/switch found.\n"
5460                << "LV: Not allowing scalar epilogue, creating predicated "
5461                << "vector loop.\n");
5462     break;
5463   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5464     // fallthrough as a special case of OptForSize
5465   case CM_ScalarEpilogueNotAllowedOptSize:
5466     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5467       LLVM_DEBUG(
5468           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5469     else
5470       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5471                         << "count.\n");
5472 
5473     // Bail if runtime checks are required, which are not good when optimising
5474     // for size.
5475     if (runtimeChecksRequired())
5476       return FixedScalableVFPair::getNone();
5477 
5478     break;
5479   }
5480 
5481   // The only loops we can vectorize without a scalar epilogue are loops with
5482   // a bottom-test and a single exiting block. We'd have to handle the fact
5483   // that not every instruction executes on the last iteration.  This will
5484   // require a lane mask which varies through the vector loop body.  (TODO)
5485   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5486     // If there was a tail-folding hint/switch, but we can't fold the tail by
5487     // masking, fallback to a vectorization with a scalar epilogue.
5488     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5489       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5490                            "scalar epilogue instead.\n");
5491       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5492       return computeFeasibleMaxVF(TC, UserVF, false);
5493     }
5494     return FixedScalableVFPair::getNone();
5495   }
5496 
5497   // Now try the tail folding
5498 
5499   // Invalidate interleave groups that require an epilogue if we can't mask
5500   // the interleave-group.
5501   if (!useMaskedInterleavedAccesses(TTI)) {
5502     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5503            "No decisions should have been taken at this point");
5504     // Note: There is no need to invalidate any cost modeling decisions here, as
5505     // none were taken so far.
5506     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5507   }
5508 
5509   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5510   // Avoid tail folding if the trip count is known to be a multiple of any VF
5511   // we chose.
5512   // FIXME: The condition below pessimises the case for fixed-width vectors,
5513   // when scalable VFs are also candidates for vectorization.
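  // For example, with a known trip count of 16, MaxFixedVF = 4 and a user
  // interleave count of 2, 16 is a multiple of 4 * 2 = 8, so no tail remains;
  // a trip count of 20 would leave a tail of 4 iterations.
  // (Illustrative numbers.)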
5514   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5515     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5516     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5517            "MaxFixedVF must be a power of 2");
5518     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5519                                    : MaxFixedVF.getFixedValue();
5520     ScalarEvolution *SE = PSE.getSE();
5521     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5522     const SCEV *ExitCount = SE->getAddExpr(
5523         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5524     const SCEV *Rem = SE->getURemExpr(
5525         SE->applyLoopGuards(ExitCount, TheLoop),
5526         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5527     if (Rem->isZero()) {
5528       // Accept MaxFixedVF if we do not have a tail.
5529       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5530       return MaxFactors;
5531     }
5532   }
5533 
5534   // For scalable vectors, don't use tail folding as this is not yet supported.
5535   // The code is likely to have ended up here if the trip count is low, in
5536   // which case it makes sense not to use scalable vectors.
5537   if (MaxFactors.ScalableVF.isVector())
5538     MaxFactors.ScalableVF = ElementCount::getScalable(0);
5539 
5540   // If we don't know the precise trip count, or if the trip count that we
5541   // found modulo the vectorization factor is not zero, try to fold the tail
5542   // by masking.
5543   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5544   if (Legal->prepareToFoldTailByMasking()) {
5545     FoldTailByMasking = true;
5546     return MaxFactors;
5547   }
5548 
5549   // If there was a tail-folding hint/switch, but we can't fold the tail by
5550   // masking, fallback to a vectorization with a scalar epilogue.
5551   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5552     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5553                          "scalar epilogue instead.\n");
5554     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5555     return MaxFactors;
5556   }
5557 
5558   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5559     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5560     return FixedScalableVFPair::getNone();
5561   }
5562 
5563   if (TC == 0) {
5564     reportVectorizationFailure(
5565         "Unable to calculate the loop count due to complex control flow",
5566         "unable to calculate the loop count due to complex control flow",
5567         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5568     return FixedScalableVFPair::getNone();
5569   }
5570 
5571   reportVectorizationFailure(
5572       "Cannot optimize for size and vectorize at the same time.",
5573       "cannot optimize for size and vectorize at the same time. "
5574       "Enable vectorization of this loop with '#pragma clang loop "
5575       "vectorize(enable)' when compiling with -Os/-Oz",
5576       "NoTailLoopWithOptForSize", ORE, TheLoop);
5577   return FixedScalableVFPair::getNone();
5578 }
5579 
5580 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5581     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5582     const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
5583   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5584   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5585       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5586                            : TargetTransformInfo::RGK_FixedWidthVector);
5587 
5588   // Convenience function to return the minimum of two ElementCounts.
5589   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5590     assert((LHS.isScalable() == RHS.isScalable()) &&
5591            "Scalable flags must match");
5592     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5593   };
5594 
5595   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5596   // Note that both WidestRegister and WidestType may not be powers of 2.
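  // For example, a 128-bit widest register and a widest loop type of i32 give
  // MaxVectorElementCount = PowerOf2Floor(128 / 32) = 4 elements (or
  // vscale x 4 when computing a scalable maximum). (Illustrative numbers.)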
5597   auto MaxVectorElementCount = ElementCount::get(
5598       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5599       ComputeScalableMaxVF);
5600   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5601   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5602                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5603 
5604   if (!MaxVectorElementCount) {
5605     LLVM_DEBUG(dbgs() << "LV: The target has no "
5606                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5607                       << " vector registers.\n");
5608     return ElementCount::getFixed(1);
5609   }
5610 
5611   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5612   if (ConstTripCount &&
5613       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5614       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5615     // If loop trip count (TC) is known at compile time there is no point in
5616     // choosing VF greater than TC (as done in the loop below). Select maximum
5617     // power of two which doesn't exceed TC.
5618     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5619     // when the TC is less than or equal to the known number of lanes.
5620     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5621     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5622                          "exceeding the constant trip count: "
5623                       << ClampedConstTripCount << "\n");
5624     return ElementCount::getFixed(ClampedConstTripCount);
5625   }
5626 
5627   ElementCount MaxVF = MaxVectorElementCount;
5628   if (TTI.shouldMaximizeVectorBandwidth() ||
5629       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5630     auto MaxVectorElementCountMaxBW = ElementCount::get(
5631         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5632         ComputeScalableMaxVF);
5633     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5634 
5635     // Collect all viable vectorization factors larger than the default MaxVF
5636     // (i.e. MaxVectorElementCount).
5637     SmallVector<ElementCount, 8> VFs;
5638     for (ElementCount VS = MaxVectorElementCount * 2;
5639          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5640       VFs.push_back(VS);
5641 
5642     // For each VF calculate its register usage.
5643     auto RUs = calculateRegisterUsage(VFs);
5644 
5645     // Select the largest VF which doesn't require more registers than existing
5646     // ones.
5647     for (int i = RUs.size() - 1; i >= 0; --i) {
5648       bool Selected = true;
5649       for (auto &pair : RUs[i].MaxLocalUsers) {
5650         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5651         if (pair.second > TargetNumRegisters)
5652           Selected = false;
5653       }
5654       if (Selected) {
5655         MaxVF = VFs[i];
5656         break;
5657       }
5658     }
5659     if (ElementCount MinVF =
5660             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5661       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5662         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5663                           << ") with target's minimum: " << MinVF << '\n');
5664         MaxVF = MinVF;
5665       }
5666     }
5667   }
5668   return MaxVF;
5669 }
5670 
5671 bool LoopVectorizationCostModel::isMoreProfitable(
5672     const VectorizationFactor &A, const VectorizationFactor &B) const {
5673   InstructionCost CostA = A.Cost;
5674   InstructionCost CostB = B.Cost;
5675 
5676   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5677 
5678   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5679       MaxTripCount) {
5680     // If we are folding the tail and the trip count is a known (possibly small)
5681     // constant, the trip count will be rounded up to an integer number of vector
5682     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5683     // which we compare directly. When not folding the tail, the total cost will
5684     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5685     // approximated with the per-lane cost below instead of using the tripcount
5686     // as here.
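    // For example, with MaxTripCount = 10 a VF-4 plan costs
    // CostA * ceil(10 / 4) = CostA * 3 in total, while a VF-8 plan costs
    // CostB * ceil(10 / 8) = CostB * 2 in total. (Illustrative numbers.)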
5687     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5688     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5689     return RTCostA < RTCostB;
5690   }
5691 
5692   // Improve estimate for the vector width if it is scalable.
5693   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5694   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5695   if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
5696     if (A.Width.isScalable())
5697       EstimatedWidthA *= VScale.getValue();
5698     if (B.Width.isScalable())
5699       EstimatedWidthB *= VScale.getValue();
5700   }
5701 
5702   // When set to preferred, for now assume vscale may be larger than 1 (or the
5703   // one being tuned for), so that scalable vectorization is slightly favorable
5704   // over fixed-width vectorization.
5705   if (Hints->isScalableVectorizationPreferred())
5706     if (A.Width.isScalable() && !B.Width.isScalable())
5707       return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5708 
5709   // To avoid the need for FP division:
5710   //      (CostA / A.Width) < (CostB / B.Width)
5711   // <=>  (CostA * B.Width) < (CostB * A.Width)
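  // For example, CostA = 8 at width 4 (2 per lane) vs. CostB = 12 at width 8
  // (1.5 per lane): 8 * 8 = 64 > 12 * 4 = 48, so B is the more profitable
  // factor. (Illustrative numbers.)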
5712   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5713 }
5714 
5715 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5716     const ElementCountSet &VFCandidates) {
5717   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5718   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5719   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5720   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5721          "Expected Scalar VF to be a candidate");
5722 
5723   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
5724   VectorizationFactor ChosenFactor = ScalarCost;
5725 
5726   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5727   if (ForceVectorization && VFCandidates.size() > 1) {
5728     // Ignore scalar width, because the user explicitly wants vectorization.
5729     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5730     // evaluation.
5731     ChosenFactor.Cost = InstructionCost::getMax();
5732   }
5733 
5734   SmallVector<InstructionVFPair> InvalidCosts;
5735   for (const auto &i : VFCandidates) {
5736     // The cost for scalar VF=1 is already calculated, so ignore it.
5737     if (i.isScalar())
5738       continue;
5739 
5740     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5741     VectorizationFactor Candidate(i, C.first);
5742 
5743 #ifndef NDEBUG
5744     unsigned AssumedMinimumVscale = 1;
5745     if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
5746       AssumedMinimumVscale = VScale.getValue();
5747     unsigned Width =
5748         Candidate.Width.isScalable()
5749             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5750             : Candidate.Width.getFixedValue();
5751     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5752                       << " costs: " << (Candidate.Cost / Width));
5753     if (i.isScalable())
5754       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5755                         << AssumedMinimumVscale << ")");
5756     LLVM_DEBUG(dbgs() << ".\n");
5757 #endif
5758 
5759     if (!C.second && !ForceVectorization) {
5760       LLVM_DEBUG(
5761           dbgs() << "LV: Not considering vector loop of width " << i
5762                  << " because it will not generate any vector instructions.\n");
5763       continue;
5764     }
5765 
5766     // If profitable add it to ProfitableVF list.
5767     if (isMoreProfitable(Candidate, ScalarCost))
5768       ProfitableVFs.push_back(Candidate);
5769 
5770     if (isMoreProfitable(Candidate, ChosenFactor))
5771       ChosenFactor = Candidate;
5772   }
5773 
5774   // Emit a report of VFs with invalid costs in the loop.
5775   if (!InvalidCosts.empty()) {
5776     // Group the remarks per instruction, keeping the instruction order from
5777     // InvalidCosts.
5778     std::map<Instruction *, unsigned> Numbering;
5779     unsigned I = 0;
5780     for (auto &Pair : InvalidCosts)
5781       if (!Numbering.count(Pair.first))
5782         Numbering[Pair.first] = I++;
5783 
5784     // Sort the list, first on instruction(number) then on VF.
5785     llvm::sort(InvalidCosts,
5786                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5787                  if (Numbering[A.first] != Numbering[B.first])
5788                    return Numbering[A.first] < Numbering[B.first];
5789                  ElementCountComparator ECC;
5790                  return ECC(A.second, B.second);
5791                });
5792 
5793     // For a list of ordered instruction-vf pairs:
5794     //   [(load, vf1), (load, vf2), (store, vf1)]
5795     // Group the instructions together to emit separate remarks for:
5796     //   load  (vf1, vf2)
5797     //   store (vf1)
5798     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5799     auto Subset = ArrayRef<InstructionVFPair>();
5800     do {
5801       if (Subset.empty())
5802         Subset = Tail.take_front(1);
5803 
5804       Instruction *I = Subset.front().first;
5805 
5806       // If the next instruction is different, or if there are no other pairs,
5807       // emit a remark for the collated subset. e.g.
5808       //   [(load, vf1), (load, vf2)]
5809       // to emit:
5810       //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5811       if (Subset == Tail || Tail[Subset.size()].first != I) {
5812         std::string OutString;
5813         raw_string_ostream OS(OutString);
5814         assert(!Subset.empty() && "Unexpected empty range");
5815         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5816         for (auto &Pair : Subset)
5817           OS << (Pair.second == Subset.front().second ? "" : ", ")
5818              << Pair.second;
5819         OS << "):";
5820         if (auto *CI = dyn_cast<CallInst>(I))
5821           OS << " call to " << CI->getCalledFunction()->getName();
5822         else
5823           OS << " " << I->getOpcodeName();
5824         OS.flush();
5825         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5826         Tail = Tail.drop_front(Subset.size());
5827         Subset = {};
5828       } else
5829         // Grow the subset by one element
5830         Subset = Tail.take_front(Subset.size() + 1);
5831     } while (!Tail.empty());
5832   }
5833 
5834   if (!EnableCondStoresVectorization && NumPredStores) {
5835     reportVectorizationFailure("There are conditional stores.",
5836         "store that is conditionally executed prevents vectorization",
5837         "ConditionalStore", ORE, TheLoop);
5838     ChosenFactor = ScalarCost;
5839   }
5840 
5841   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5842                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5843              << "LV: Vectorization seems to be not beneficial, "
5844              << "but was forced by a user.\n");
5845   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5846   return ChosenFactor;
5847 }
5848 
5849 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5850     const Loop &L, ElementCount VF) const {
5851   // Cross iteration phis such as reductions need special handling and are
5852   // currently unsupported.
5853   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5854         return Legal->isFirstOrderRecurrence(&Phi) ||
5855                Legal->isReductionVariable(&Phi);
5856       }))
5857     return false;
5858 
5859   // Phis with uses outside of the loop require special handling and are
5860   // currently unsupported.
5861   for (auto &Entry : Legal->getInductionVars()) {
5862     // Look for uses of the value of the induction at the last iteration.
5863     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5864     for (User *U : PostInc->users())
5865       if (!L.contains(cast<Instruction>(U)))
5866         return false;
5867     // Look for uses of penultimate value of the induction.
5868     for (User *U : Entry.first->users())
5869       if (!L.contains(cast<Instruction>(U)))
5870         return false;
5871   }
5872 
5873   // Induction variables that are widened require special handling that is
5874   // currently not supported.
5875   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5876         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5877                  this->isProfitableToScalarize(Entry.first, VF));
5878       }))
5879     return false;
5880 
5881   // Epilogue vectorization code has not been audited to ensure it handles
5882   // non-latch exits properly. It may be fine, but it needs to be audited and
5883   // tested.
5884   if (L.getExitingBlock() != L.getLoopLatch())
5885     return false;
5886 
5887   return true;
5888 }
5889 
5890 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5891     const ElementCount VF) const {
5892   // FIXME: We need a much better cost-model to take different parameters such
5893   // as register pressure, code size increase and cost of extra branches into
5894   // account. For now we apply a very crude heuristic and only consider loops
5895   // with vectorization factors larger than a certain value.
5896   // We also consider epilogue vectorization unprofitable for targets that don't
5897   // consider interleaving beneficial (e.g. MVE).
5898   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5899     return false;
5900   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5901     return true;
5902   return false;
5903 }
5904 
5905 VectorizationFactor
5906 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5907     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5908   VectorizationFactor Result = VectorizationFactor::Disabled();
5909   if (!EnableEpilogueVectorization) {
5910     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5911     return Result;
5912   }
5913 
5914   if (!isScalarEpilogueAllowed()) {
5915     LLVM_DEBUG(
5916         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5917                   "allowed.\n";);
5918     return Result;
5919   }
5920 
5921   // Not really a cost consideration, but check for unsupported cases here to
5922   // simplify the logic.
5923   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5924     LLVM_DEBUG(
5925         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5926                   "not a supported candidate.\n";);
5927     return Result;
5928   }
5929 
5930   if (EpilogueVectorizationForceVF > 1) {
5931     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5932     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5933     if (LVP.hasPlanWithVF(ForcedEC))
5934       return {ForcedEC, 0};
5935     else {
5936       LLVM_DEBUG(
5937           dbgs()
5938               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5939       return Result;
5940     }
5941   }
5942 
5943   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5944       TheLoop->getHeader()->getParent()->hasMinSize()) {
5945     LLVM_DEBUG(
5946         dbgs()
5947             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5948     return Result;
5949   }
5950 
5951   auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5952   if (MainLoopVF.isScalable())
5953     LLVM_DEBUG(
5954         dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
5955                   "yet supported. Converting to fixed-width (VF="
5956                << FixedMainLoopVF << ") instead\n");
5957 
5958   if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) {
5959     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5960                          "this loop\n");
5961     return Result;
5962   }
5963 
5964   for (auto &NextVF : ProfitableVFs)
5965     if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) &&
5966         (Result.Width.getFixedValue() == 1 ||
5967          isMoreProfitable(NextVF, Result)) &&
5968         LVP.hasPlanWithVF(NextVF.Width))
5969       Result = NextVF;
5970 
5971   if (Result != VectorizationFactor::Disabled())
5972     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5973                       << Result.Width.getFixedValue() << "\n";);
5974   return Result;
5975 }
5976 
5977 std::pair<unsigned, unsigned>
5978 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5979   unsigned MinWidth = -1U;
5980   unsigned MaxWidth = 8;
5981   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5982   for (Type *T : ElementTypesInLoop) {
5983     MinWidth = std::min<unsigned>(
5984         MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5985     MaxWidth = std::max<unsigned>(
5986         MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5987   }
5988   return {MinWidth, MaxWidth};
5989 }
5990 
5991 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5992   ElementTypesInLoop.clear();
5993   // For each block.
5994   for (BasicBlock *BB : TheLoop->blocks()) {
5995     // For each instruction in the loop.
5996     for (Instruction &I : BB->instructionsWithoutDebug()) {
5997       Type *T = I.getType();
5998 
5999       // Skip ignored values.
6000       if (ValuesToIgnore.count(&I))
6001         continue;
6002 
6003       // Only examine Loads, Stores and PHINodes.
6004       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6005         continue;
6006 
6007       // Examine PHI nodes that are reduction variables. Update the type to
6008       // account for the recurrence type.
6009       if (auto *PN = dyn_cast<PHINode>(&I)) {
6010         if (!Legal->isReductionVariable(PN))
6011           continue;
6012         const RecurrenceDescriptor &RdxDesc =
6013             Legal->getReductionVars().find(PN)->second;
6014         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
6015             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
6016                                       RdxDesc.getRecurrenceType(),
6017                                       TargetTransformInfo::ReductionFlags()))
6018           continue;
6019         T = RdxDesc.getRecurrenceType();
6020       }
6021 
6022       // Examine the stored values.
6023       if (auto *ST = dyn_cast<StoreInst>(&I))
6024         T = ST->getValueOperand()->getType();
6025 
6026       // Ignore loaded pointer types and stored pointer types that are not
6027       // vectorizable.
6028       //
6029       // FIXME: The check here attempts to predict whether a load or store will
6030       //        be vectorized. We only know this for certain after a VF has
6031       //        been selected. Here, we assume that if an access can be
6032       //        vectorized, it will be. We should also look at extending this
6033       //        optimization to non-pointer types.
6034       //
6035       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6036           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
6037         continue;
6038 
6039       ElementTypesInLoop.insert(T);
6040     }
6041   }
6042 }
6043 
6044 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6045                                                            unsigned LoopCost) {
6046   // -- The interleave heuristics --
6047   // We interleave the loop in order to expose ILP and reduce the loop overhead.
6048   // There are many micro-architectural considerations that we can't predict
6049   // at this level. For example, frontend pressure (on decode or fetch) due to
6050   // code size, or the number and capabilities of the execution ports.
6051   //
6052   // We use the following heuristics to select the interleave count:
6053   // 1. If the code has reductions, then we interleave to break the cross
6054   // iteration dependency.
6055   // 2. If the loop is really small, then we interleave to reduce the loop
6056   // overhead.
6057   // 3. We don't interleave if we think that we will spill registers to memory
6058   // due to the increased register pressure.
6059 
6060   if (!isScalarEpilogueAllowed())
6061     return 1;
6062 
  // Do not interleave if there is a finite maximum safe dependence distance;
  // it has already been used to limit the vectorization factor.
6064   if (Legal->getMaxSafeDepDistBytes() != -1U)
6065     return 1;
6066 
6067   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6068   const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because under those conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
6074   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6075       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6076     return 1;
6077 
6078   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these values below, so clamp each register class to at least
  // one local user to avoid dividing by zero.
6081   for (auto& pair : R.MaxLocalUsers) {
6082     pair.second = std::max(pair.second, 1U);
6083   }
6084 
6085   // We calculate the interleave count using the following formula.
6086   // Subtract the number of loop invariants from the number of available
6087   // registers. These registers are used by all of the interleaved instances.
6088   // Next, divide the remaining registers by the number of registers that is
6089   // required by the loop, in order to estimate how many parallel instances
6090   // fit without causing spills. All of this is rounded down if necessary to be
6091   // a power of two. We want power of two interleave count to simplify any
6092   // addressing operations or alignment considerations.
6093   // We also want power of two interleave counts to ensure that the induction
6094   // variable of the vector loop wraps to zero, when tail is folded by masking;
6095   // this currently happens when OptForSize, in which case IC is set to 1 above.
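  // As a worked example with assumed values: 32 registers in a class, 2 used
  // by loop invariants, and a maximum of 7 local users gives
  // PowerOf2Floor((32 - 2) / 7) = PowerOf2Floor(4) = 4 interleaved copies.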
6096   unsigned IC = UINT_MAX;
6097 
6098   for (auto& pair : R.MaxLocalUsers) {
6099     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
6100     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6101                       << " registers of "
6102                       << TTI.getRegisterClassName(pair.first) << " register class\n");
6103     if (VF.isScalar()) {
6104       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6105         TargetNumRegisters = ForceTargetNumScalarRegs;
6106     } else {
6107       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6108         TargetNumRegisters = ForceTargetNumVectorRegs;
6109     }
6110     unsigned MaxLocalUsers = pair.second;
6111     unsigned LoopInvariantRegs = 0;
6112     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6113       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6114 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6116     // Don't count the induction variable as interleaved.
6117     if (EnableIndVarRegisterHeur) {
6118       TmpIC =
6119           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6120                         std::max(1U, (MaxLocalUsers - 1)));
6121     }
6122 
6123     IC = std::min(IC, TmpIC);
6124   }
6125 
6126   // Clamp the interleave ranges to reasonable counts.
6127   unsigned MaxInterleaveCount =
6128       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6129 
6130   // Check if the user has overridden the max.
6131   if (VF.isScalar()) {
6132     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6133       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6134   } else {
6135     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6136       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6137   }
6138 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count to at most the trip count divided by VF, and make
  // sure the result is at least 1.
  //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second
  // vector iteration is enabled. However, for larger loops, there is likely
  // to be a similar benefit as for fixed-width vectors. For now, we choose to
  // leave the InterleaveCount as if vscale is '1', although if some
  // information about the vector is known (e.g. min vector size), we can make
  // a better decision.
6149   if (BestKnownTC) {
6150     MaxInterleaveCount =
6151         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6152     // Make sure MaxInterleaveCount is greater than 0.
6153     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6154   }
6155 
6156   assert(MaxInterleaveCount > 0 &&
6157          "Maximum interleave count must be greater than 0");
6158 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
6161   if (IC > MaxInterleaveCount)
6162     IC = MaxInterleaveCount;
6163   else
6164     // Make sure IC is greater than 0.
6165     IC = std::max(1u, IC);
6166 
6167   assert(IC > 0 && "Interleave count must be greater than 0.");
6168 
6169   // If we did not calculate the cost for VF (because the user selected the VF)
6170   // then we calculate the cost of VF here.
6171   if (LoopCost == 0) {
6172     InstructionCost C = expectedCost(VF).first;
6173     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
6174     LoopCost = *C.getValue();
6175   }
6176 
6177   assert(LoopCost && "Non-zero loop cost expected");
6178 
6179   // Interleave if we vectorized this loop and there is a reduction that could
6180   // benefit from interleaving.
6181   if (VF.isVector() && HasReductions) {
6182     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6183     return IC;
6184   }
6185 
6186   // Note that if we've already vectorized the loop we will have done the
6187   // runtime check and so interleaving won't require further checks.
6188   bool InterleavingRequiresRuntimePointerCheck =
6189       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6190 
6191   // We want to interleave small loops in order to reduce the loop overhead and
6192   // potentially expose ILP opportunities.
6193   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6194                     << "LV: IC is " << IC << '\n'
6195                     << "LV: VF is " << VF << '\n');
6196   const bool AggressivelyInterleaveReductions =
6197       TTI.enableAggressiveInterleaving(HasReductions);
6198   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6199     // We assume that the cost overhead is 1 and we use the cost model
6200     // to estimate the cost of the loop and interleave until the cost of the
6201     // loop overhead is about 5% of the cost of the loop.
6202     unsigned SmallIC =
6203         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
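    // For example, with assumed values SmallLoopCost = 20 and LoopCost = 3,
    // the candidate count is PowerOf2Floor(20 / 3) = 4, further capped by IC.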
6204 
6205     // Interleave until store/load ports (estimated by max interleave count) are
6206     // saturated.
6207     unsigned NumStores = Legal->getNumStores();
6208     unsigned NumLoads = Legal->getNumLoads();
6209     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6210     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
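    // For example, IC = 8 with 2 stores and 4 loads gives StoresIC = 4 and
    // LoadsIC = 2.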
6211 
6212     // There is little point in interleaving for reductions containing selects
6213     // and compares when VF=1 since it may just create more overhead than it's
6214     // worth for loops with small trip counts. This is because we still have to
6215     // do the final reduction after the loop.
6216     bool HasSelectCmpReductions =
6217         HasReductions &&
6218         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6219           const RecurrenceDescriptor &RdxDesc = Reduction.second;
6220           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
6221               RdxDesc.getRecurrenceKind());
6222         });
6223     if (HasSelectCmpReductions) {
6224       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
6225       return 1;
6226     }
6227 
6228     // If we have a scalar reduction (vector reductions are already dealt with
6229     // by this point), we can increase the critical path length if the loop
6230     // we're interleaving is inside another loop. For tree-wise reductions
6231     // set the limit to 2, and for ordered reductions it's best to disable
6232     // interleaving entirely.
6233     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6234       bool HasOrderedReductions =
6235           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6236             const RecurrenceDescriptor &RdxDesc = Reduction.second;
6237             return RdxDesc.isOrdered();
6238           });
6239       if (HasOrderedReductions) {
6240         LLVM_DEBUG(
6241             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
6242         return 1;
6243       }
6244 
6245       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6246       SmallIC = std::min(SmallIC, F);
6247       StoresIC = std::min(StoresIC, F);
6248       LoadsIC = std::min(LoadsIC, F);
6249     }
6250 
6251     if (EnableLoadStoreRuntimeInterleave &&
6252         std::max(StoresIC, LoadsIC) > SmallIC) {
6253       LLVM_DEBUG(
6254           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6255       return std::max(StoresIC, LoadsIC);
6256     }
6257 
6258     // If there are scalar reductions and TTI has enabled aggressive
6259     // interleaving for reductions, we will interleave to expose ILP.
6260     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6261         AggressivelyInterleaveReductions) {
6262       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave by at least SmallIC, but not as aggressively as the normal
      // IC, to handle the rare case where resources are too limited.
6265       return std::max(IC / 2, SmallIC);
6266     } else {
6267       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6268       return SmallIC;
6269     }
6270   }
6271 
6272   // Interleave if this is a large loop (small loops are already dealt with by
6273   // this point) that could benefit from interleaving.
6274   if (AggressivelyInterleaveReductions) {
6275     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6276     return IC;
6277   }
6278 
6279   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6280   return 1;
6281 }
6282 
6283 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6284 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6285   // This function calculates the register usage by measuring the highest number
6286   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are
6289   // met before their users. We assume that each instruction that has in-loop
6290   // users starts an interval. We record every time that an in-loop value is
6291   // used, so we have a list of the first and last occurrences of each
6292   // instruction. Next, we transpose this data structure into a multi map that
6293   // holds the list of intervals that *end* at a specific location. This multi
6294   // map allows us to perform a linear search. We scan the instructions linearly
6295   // and record each time that a new interval starts, by placing it in a set.
6296   // If we find this value in the multi-map then we remove it from the set.
6297   // The max register usage is the maximum size of the set.
6298   // We also search for instructions that are defined outside the loop, but are
6299   // used inside the loop. We need this number separately from the max-interval
6300   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
6302   LoopBlocksDFS DFS(TheLoop);
6303   DFS.perform(LI);
6304 
6305   RegisterUsage RU;
6306 
6307   // Each 'key' in the map opens a new interval. The values
6308   // of the map are the index of the 'last seen' usage of the
6309   // instruction that is the key.
6310   using IntervalMap = DenseMap<Instruction *, unsigned>;
6311 
6312   // Maps instruction to its index.
6313   SmallVector<Instruction *, 64> IdxToInstr;
6314   // Marks the end of each interval.
6315   IntervalMap EndPoint;
  // Saves the set of instructions whose values are used in the loop.
6317   SmallPtrSet<Instruction *, 8> Ends;
6318   // Saves the list of values that are used in the loop but are
6319   // defined outside the loop, such as arguments and constants.
6320   SmallPtrSet<Value *, 8> LoopInvariants;
6321 
6322   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6323     for (Instruction &I : BB->instructionsWithoutDebug()) {
6324       IdxToInstr.push_back(&I);
6325 
6326       // Save the end location of each USE.
6327       for (Value *U : I.operands()) {
6328         auto *Instr = dyn_cast<Instruction>(U);
6329 
6330         // Ignore non-instruction values such as arguments, constants, etc.
6331         if (!Instr)
6332           continue;
6333 
6334         // If this instruction is outside the loop then record it and continue.
6335         if (!TheLoop->contains(Instr)) {
6336           LoopInvariants.insert(Instr);
6337           continue;
6338         }
6339 
6340         // Overwrite previous end points.
6341         EndPoint[Instr] = IdxToInstr.size();
6342         Ends.insert(Instr);
6343       }
6344     }
6345   }
6346 
6347   // Saves the list of intervals that end with the index in 'key'.
6348   using InstrList = SmallVector<Instruction *, 2>;
6349   DenseMap<unsigned, InstrList> TransposeEnds;
6350 
6351   // Transpose the EndPoints to a list of values that end at each index.
6352   for (auto &Interval : EndPoint)
6353     TransposeEnds[Interval.second].push_back(Interval.first);
6354 
6355   SmallPtrSet<Instruction *, 8> OpenIntervals;
6356   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6357   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6358 
6359   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6360 
6361   // A lambda that gets the register usage for the given type and VF.
6362   const auto &TTICapture = TTI;
6363   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6364     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6365       return 0;
6366     InstructionCost::CostType RegUsage =
6367         *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6368     assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
6369            "Nonsensical values for register usage.");
6370     return RegUsage;
6371   };
6372 
6373   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6374     Instruction *I = IdxToInstr[i];
6375 
6376     // Remove all of the instructions that end at this location.
6377     InstrList &List = TransposeEnds[i];
6378     for (Instruction *ToRemove : List)
6379       OpenIntervals.erase(ToRemove);
6380 
6381     // Ignore instructions that are never used within the loop.
6382     if (!Ends.count(I))
6383       continue;
6384 
6385     // Skip ignored values.
6386     if (ValuesToIgnore.count(I))
6387       continue;
6388 
6389     // For each VF find the maximum usage of registers.
6390     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6391       // Count the number of live intervals.
6392       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6393 
6394       if (VFs[j].isScalar()) {
6395         for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
6397           if (RegUsage.find(ClassID) == RegUsage.end())
6398             RegUsage[ClassID] = 1;
6399           else
6400             RegUsage[ClassID] += 1;
6401         }
6402       } else {
6403         collectUniformsAndScalars(VFs[j]);
6404         for (auto Inst : OpenIntervals) {
6405           // Skip ignored values for VF > 1.
6406           if (VecValuesToIgnore.count(Inst))
6407             continue;
6408           if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
6410             if (RegUsage.find(ClassID) == RegUsage.end())
6411               RegUsage[ClassID] = 1;
6412             else
6413               RegUsage[ClassID] += 1;
6414           } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
6416             if (RegUsage.find(ClassID) == RegUsage.end())
6417               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6418             else
6419               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6420           }
6421         }
6422       }
6423 
6424       for (auto& pair : RegUsage) {
6425         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] =
              std::max(MaxUsages[j][pair.first], pair.second);
6427         else
6428           MaxUsages[j][pair.first] = pair.second;
6429       }
6430     }
6431 
6432     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6433                       << OpenIntervals.size() << '\n');
6434 
6435     // Add the current instruction to the list of open intervals.
6436     OpenIntervals.insert(I);
6437   }
6438 
6439   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6440     SmallMapVector<unsigned, unsigned, 4> Invariant;
6441 
6442     for (auto Inst : LoopInvariants) {
6443       unsigned Usage =
6444           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6445       unsigned ClassID =
6446           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6447       if (Invariant.find(ClassID) == Invariant.end())
6448         Invariant[ClassID] = Usage;
6449       else
6450         Invariant[ClassID] += Usage;
6451     }
6452 
6453     LLVM_DEBUG({
6454       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6455       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6456              << " item\n";
6457       for (const auto &pair : MaxUsages[i]) {
6458         dbgs() << "LV(REG): RegisterClass: "
6459                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6460                << " registers\n";
6461       }
6462       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6463              << " item\n";
6464       for (const auto &pair : Invariant) {
6465         dbgs() << "LV(REG): RegisterClass: "
6466                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6467                << " registers\n";
6468       }
6469     });
6470 
6471     RU.LoopInvariantRegs = Invariant;
6472     RU.MaxLocalUsers = MaxUsages[i];
6473     RUs[i] = RU;
6474   }
6475 
6476   return RUs;
6477 }
6478 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6480   // TODO: Cost model for emulated masked load/store is completely
6481   // broken. This hack guides the cost model to use an artificially
6482   // high enough value to practically disable vectorization with such
6483   // operations, except where previously deployed legality hack allowed
6484   // using very low cost values. This is to avoid regressions coming simply
6485   // from moving "masked load/store" check from legality to cost model.
  // Emulated masked load/gather was previously never allowed; only a limited
  // number of emulated masked store/scatter operations were allowed.
6488   assert(isPredicatedInst(I) &&
6489          "Expecting a scalar emulated instruction");
6490   return isa<LoadInst>(I) ||
6491          (isa<StoreInst>(I) &&
6492           NumPredStores > NumberOfStoresToPredicate);
6493 }
6494 
6495 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6496   // If we aren't vectorizing the loop, or if we've already collected the
6497   // instructions to scalarize, there's nothing to do. Collection may already
6498   // have occurred if we have a user-selected VF and are now computing the
6499   // expected cost for interleaving.
6500   if (VF.isScalar() || VF.isZero() ||
6501       InstsToScalarize.find(VF) != InstsToScalarize.end())
6502     return;
6503 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6505   // not profitable to scalarize any instructions, the presence of VF in the
6506   // map will indicate that we've analyzed it already.
6507   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6508 
6509   // Find all the instructions that are scalar with predication in the loop and
6510   // determine if it would be better to not if-convert the blocks they are in.
6511   // If so, we also record the instructions to scalarize.
6512   for (BasicBlock *BB : TheLoop->blocks()) {
6513     if (!blockNeedsPredicationForAnyReason(BB))
6514       continue;
6515     for (Instruction &I : *BB)
6516       if (isScalarWithPredication(&I)) {
6517         ScalarCostsTy ScalarCosts;
6518         // Do not apply discount if scalable, because that would lead to
6519         // invalid scalarization costs.
6520         // Do not apply discount logic if hacked cost is needed
6521         // for emulated masked memrefs.
6522         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
6523             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6524           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6525         // Remember that BB will remain after vectorization.
6526         PredicatedBBsAfterVectorization.insert(BB);
6527       }
6528   }
6529 }
6530 
6531 int LoopVectorizationCostModel::computePredInstDiscount(
6532     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6533   assert(!isUniformAfterVectorization(PredInst, VF) &&
6534          "Instruction marked uniform-after-vectorization will be predicated");
6535 
6536   // Initialize the discount to zero, meaning that the scalar version and the
6537   // vector version cost the same.
6538   InstructionCost Discount = 0;
6539 
6540   // Holds instructions to analyze. The instructions we visit are mapped in
6541   // ScalarCosts. Those instructions are the ones that would be scalarized if
6542   // we find that the scalar version costs less.
6543   SmallVector<Instruction *, 8> Worklist;
6544 
6545   // Returns true if the given instruction can be scalarized.
6546   auto canBeScalarized = [&](Instruction *I) -> bool {
6547     // We only attempt to scalarize instructions forming a single-use chain
6548     // from the original predicated block that would otherwise be vectorized.
6549     // Although not strictly necessary, we give up on instructions we know will
6550     // already be scalar to avoid traversing chains that are unlikely to be
6551     // beneficial.
6552     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6553         isScalarAfterVectorization(I, VF))
6554       return false;
6555 
6556     // If the instruction is scalar with predication, it will be analyzed
6557     // separately. We ignore it within the context of PredInst.
6558     if (isScalarWithPredication(I))
6559       return false;
6560 
6561     // If any of the instruction's operands are uniform after vectorization,
6562     // the instruction cannot be scalarized. This prevents, for example, a
6563     // masked load from being scalarized.
6564     //
6565     // We assume we will only emit a value for lane zero of an instruction
6566     // marked uniform after vectorization, rather than VF identical values.
6567     // Thus, if we scalarize an instruction that uses a uniform, we would
6568     // create uses of values corresponding to the lanes we aren't emitting code
6569     // for. This behavior can be changed by allowing getScalarValue to clone
6570     // the lane zero values for uniforms rather than asserting.
6571     for (Use &U : I->operands())
6572       if (auto *J = dyn_cast<Instruction>(U.get()))
6573         if (isUniformAfterVectorization(J, VF))
6574           return false;
6575 
6576     // Otherwise, we can scalarize the instruction.
6577     return true;
6578   };
6579 
6580   // Compute the expected cost discount from scalarizing the entire expression
6581   // feeding the predicated instruction. We currently only consider expressions
6582   // that are single-use instruction chains.
6583   Worklist.push_back(PredInst);
6584   while (!Worklist.empty()) {
6585     Instruction *I = Worklist.pop_back_val();
6586 
6587     // If we've already analyzed the instruction, there's nothing to do.
6588     if (ScalarCosts.find(I) != ScalarCosts.end())
6589       continue;
6590 
6591     // Compute the cost of the vector instruction. Note that this cost already
6592     // includes the scalarization overhead of the predicated instruction.
6593     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6594 
6595     // Compute the cost of the scalarized instruction. This cost is the cost of
6596     // the instruction as if it wasn't if-converted and instead remained in the
6597     // predicated block. We will scale this cost by block probability after
6598     // computing the scalarization overhead.
6599     InstructionCost ScalarCost =
6600         VF.getFixedValue() *
6601         getInstructionCost(I, ElementCount::getFixed(1)).first;
6602 
6603     // Compute the scalarization overhead of needed insertelement instructions
6604     // and phi nodes.
6605     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6606       ScalarCost += TTI.getScalarizationOverhead(
6607           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6608           APInt::getAllOnes(VF.getFixedValue()), true, false);
6609       ScalarCost +=
6610           VF.getFixedValue() *
6611           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6612     }
6613 
6614     // Compute the scalarization overhead of needed extractelement
6615     // instructions. For each of the instruction's operands, if the operand can
6616     // be scalarized, add it to the worklist; otherwise, account for the
6617     // overhead.
6618     for (Use &U : I->operands())
6619       if (auto *J = dyn_cast<Instruction>(U.get())) {
6620         assert(VectorType::isValidElementType(J->getType()) &&
6621                "Instruction has non-scalar type");
6622         if (canBeScalarized(J))
6623           Worklist.push_back(J);
6624         else if (needsExtract(J, VF)) {
6625           ScalarCost += TTI.getScalarizationOverhead(
6626               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6627               APInt::getAllOnes(VF.getFixedValue()), false, true);
6628         }
6629       }
6630 
6631     // Scale the total scalar cost by block probability.
6632     ScalarCost /= getReciprocalPredBlockProb();
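    // For example, with a reciprocal block probability of 2 (the predicated
    // block is assumed to execute half the time), the scalar cost is halved.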
6633 
6634     // Compute the discount. A non-negative discount means the vector version
6635     // of the instruction costs more, and scalarizing would be beneficial.
6636     Discount += VectorCost - ScalarCost;
6637     ScalarCosts[I] = ScalarCost;
6638   }
6639 
6640   return *Discount.getValue();
6641 }
6642 
6643 LoopVectorizationCostModel::VectorizationCostTy
6644 LoopVectorizationCostModel::expectedCost(
6645     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6646   VectorizationCostTy Cost;
6647 
6648   // For each block.
6649   for (BasicBlock *BB : TheLoop->blocks()) {
6650     VectorizationCostTy BlockCost;
6651 
6652     // For each instruction in the old loop.
6653     for (Instruction &I : BB->instructionsWithoutDebug()) {
6654       // Skip ignored values.
6655       if (ValuesToIgnore.count(&I) ||
6656           (VF.isVector() && VecValuesToIgnore.count(&I)))
6657         continue;
6658 
6659       VectorizationCostTy C = getInstructionCost(&I, VF);
6660 
6661       // Check if we should override the cost.
6662       if (C.first.isValid() &&
6663           ForceTargetInstructionCost.getNumOccurrences() > 0)
6664         C.first = InstructionCost(ForceTargetInstructionCost);
6665 
6666       // Keep a list of instructions with invalid costs.
6667       if (Invalid && !C.first.isValid())
6668         Invalid->emplace_back(&I, VF);
6669 
6670       BlockCost.first += C.first;
6671       BlockCost.second |= C.second;
6672       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6673                         << " for VF " << VF << " For instruction: " << I
6674                         << '\n');
6675     }
6676 
6677     // If we are vectorizing a predicated block, it will have been
6678     // if-converted. This means that the block's instructions (aside from
6679     // stores and instructions that may divide by zero) will now be
6680     // unconditionally executed. For the scalar case, we may not always execute
6681     // the predicated block, if it is an if-else block. Thus, scale the block's
6682     // cost by the probability of executing it. blockNeedsPredication from
6683     // Legal is used so as to not include all blocks in tail folded loops.
6684     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6685       BlockCost.first /= getReciprocalPredBlockProb();
6686 
6687     Cost.first += BlockCost.first;
6688     Cost.second |= BlockCost.second;
6689   }
6690 
6691   return Cost;
6692 }
6693 
/// Gets the address access SCEV after verifying that the access pattern
/// is loop invariant except for the induction variable dependence.
6696 ///
6697 /// This SCEV can be sent to the Target in order to estimate the address
6698 /// calculation cost.
static const SCEV *getAddressAccessSCEV(Value *Ptr,
                                        LoopVectorizationLegality *Legal,
                                        PredicatedScalarEvolution &PSE,
                                        const Loop *TheLoop) {
6705   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6706   if (!Gep)
6707     return nullptr;
6708 
6709   // We are looking for a gep with all loop invariant indices except for one
6710   // which should be an induction variable.
6711   auto SE = PSE.getSE();
6712   unsigned NumOperands = Gep->getNumOperands();
6713   for (unsigned i = 1; i < NumOperands; ++i) {
6714     Value *Opd = Gep->getOperand(i);
6715     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6716         !Legal->isInductionVariable(Opd))
6717       return nullptr;
6718   }
6719 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6721   return PSE.getSCEV(Ptr);
6722 }
6723 
6724 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6725   return Legal->hasStride(I->getOperand(0)) ||
6726          Legal->hasStride(I->getOperand(1));
6727 }
6728 
6729 InstructionCost
6730 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6731                                                         ElementCount VF) {
6732   assert(VF.isVector() &&
6733          "Scalarization cost of instruction implies vectorization.");
6734   if (VF.isScalable())
6735     return InstructionCost::getInvalid();
6736 
6737   Type *ValTy = getLoadStoreType(I);
6738   auto SE = PSE.getSE();
6739 
6740   unsigned AS = getLoadStoreAddressSpace(I);
6741   Value *Ptr = getLoadStorePointerOperand(I);
6742   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6743   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6744   //       that it is being called from this specific place.
6745 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6748   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6749 
6750   // Get the cost of the scalar memory instruction and address computation.
6751   InstructionCost Cost =
6752       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
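  // For example, at VF = 4 this accounts for four scalar address
  // computations, one per scalarized lane.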
6753 
6754   // Don't pass *I here, since it is scalar but will actually be part of a
6755   // vectorized loop where the user of it is a vectorized instruction.
6756   const Align Alignment = getLoadStoreAlignment(I);
6757   Cost += VF.getKnownMinValue() *
6758           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6759                               AS, TTI::TCK_RecipThroughput);
6760 
6761   // Get the overhead of the extractelement and insertelement instructions
6762   // we might create due to scalarization.
6763   Cost += getScalarizationOverhead(I, VF);
6764 
6765   // If we have a predicated load/store, it will need extra i1 extracts and
6766   // conditional branches, but may not be executed for each vector lane. Scale
6767   // the cost by the probability of executing the predicated block.
6768   if (isPredicatedInst(I)) {
6769     Cost /= getReciprocalPredBlockProb();
6770 
6771     // Add the cost of an i1 extract and a branch
6772     auto *Vec_i1Ty =
6773         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6774     Cost += TTI.getScalarizationOverhead(
6775         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6776         /*Insert=*/false, /*Extract=*/true);
6777     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6778 
6779     if (useEmulatedMaskMemRefHack(I))
6780       // Artificially setting to a high enough value to practically disable
6781       // vectorization with such operations.
6782       Cost = 3000000;
6783   }
6784 
6785   return Cost;
6786 }
6787 
6788 InstructionCost
6789 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6790                                                     ElementCount VF) {
6791   Type *ValTy = getLoadStoreType(I);
6792   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6793   Value *Ptr = getLoadStorePointerOperand(I);
6794   unsigned AS = getLoadStoreAddressSpace(I);
6795   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6796   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6797 
6798   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6799          "Stride should be 1 or -1 for consecutive memory access");
6800   const Align Alignment = getLoadStoreAlignment(I);
6801   InstructionCost Cost = 0;
6802   if (Legal->isMaskRequired(I))
6803     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6804                                       CostKind);
6805   else
6806     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6807                                 CostKind, I);
6808 
6809   bool Reverse = ConsecutiveStride < 0;
6810   if (Reverse)
6811     Cost +=
6812         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6813   return Cost;
6814 }
6815 
6816 InstructionCost
6817 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6818                                                 ElementCount VF) {
6819   assert(Legal->isUniformMemOp(*I));
6820 
6821   Type *ValTy = getLoadStoreType(I);
6822   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6823   const Align Alignment = getLoadStoreAlignment(I);
6824   unsigned AS = getLoadStoreAddressSpace(I);
6825   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6826   if (isa<LoadInst>(I)) {
6827     return TTI.getAddressComputationCost(ValTy) +
6828            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6829                                CostKind) +
6830            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6831   }
6832   StoreInst *SI = cast<StoreInst>(I);
6833 
6834   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6835   return TTI.getAddressComputationCost(ValTy) +
6836          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6837                              CostKind) +
6838          (isLoopInvariantStoreValue
6839               ? 0
6840               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6841                                        VF.getKnownMinValue() - 1));
6842 }
6843 
6844 InstructionCost
6845 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6846                                                  ElementCount VF) {
6847   Type *ValTy = getLoadStoreType(I);
6848   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6849   const Align Alignment = getLoadStoreAlignment(I);
6850   const Value *Ptr = getLoadStorePointerOperand(I);
6851 
6852   return TTI.getAddressComputationCost(VectorTy) +
6853          TTI.getGatherScatterOpCost(
6854              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6855              TargetTransformInfo::TCK_RecipThroughput, I);
6856 }
6857 
6858 InstructionCost
6859 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6860                                                    ElementCount VF) {
6861   // TODO: Once we have support for interleaving with scalable vectors
6862   // we can calculate the cost properly here.
6863   if (VF.isScalable())
6864     return InstructionCost::getInvalid();
6865 
6866   Type *ValTy = getLoadStoreType(I);
6867   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6868   unsigned AS = getLoadStoreAddressSpace(I);
6869 
6870   auto Group = getInterleavedAccessGroup(I);
6871   assert(Group && "Fail to get an interleaved access group.");
6872 
6873   unsigned InterleaveFactor = Group->getFactor();
6874   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
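  // For example, an interleave factor of 2 at VF = 4 costs the group as a
  // single wide access of 8 elements.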
6875 
6876   // Holds the indices of existing members in the interleaved group.
6877   SmallVector<unsigned, 4> Indices;
6878   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6879     if (Group->getMember(IF))
6880       Indices.push_back(IF);
6881 
6882   // Calculate the cost of the whole interleaved group.
6883   bool UseMaskForGaps =
6884       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6885       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6886   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6887       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6888       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6889 
6890   if (Group->isReverse()) {
6891     // TODO: Add support for reversed masked interleaved access.
6892     assert(!Legal->isMaskRequired(I) &&
6893            "Reverse masked interleaved access not supported.");
6894     Cost +=
6895         Group->getNumMembers() *
6896         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6897   }
6898   return Cost;
6899 }
6900 
6901 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6902     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6903   using namespace llvm::PatternMatch;
  // Early exit if there are no in-loop reductions.
6905   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6906     return None;
6907   auto *VectorTy = cast<VectorType>(Ty);
6908 
  // We are looking for a pattern of, and finding the minimal acceptable cost
  // for, one of:
6910   //  reduce(mul(ext(A), ext(B))) or
6911   //  reduce(mul(A, B)) or
6912   //  reduce(ext(A)) or
6913   //  reduce(A).
6914   // The basic idea is that we walk down the tree to do that, finding the root
6915   // reduction instruction in InLoopReductionImmediateChains. From there we find
6916   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, then we return it for
  // the reduction instruction and 0 for the other instructions in the
  // pattern. If it is not, we return an invalid cost specifying that the
  // original cost method should be used.
6921   Instruction *RetI = I;
6922   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6923     if (!RetI->hasOneUser())
6924       return None;
6925     RetI = RetI->user_back();
6926   }
6927   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6928       RetI->user_back()->getOpcode() == Instruction::Add) {
6929     if (!RetI->hasOneUser())
6930       return None;
6931     RetI = RetI->user_back();
6932   }
6933 
  // Test if the found instruction is a reduction. If it is not, return an
  // invalid cost so the caller falls back to the original cost modelling.
6936   if (!InLoopReductionImmediateChains.count(RetI))
6937     return None;
6938 
6939   // Find the reduction this chain is a part of and calculate the basic cost of
6940   // the reduction on its own.
6941   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6942   Instruction *ReductionPhi = LastChain;
6943   while (!isa<PHINode>(ReductionPhi))
6944     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6945 
6946   const RecurrenceDescriptor &RdxDesc =
6947       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6948 
6949   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6950       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6951 
6952   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6953   // normal fmul instruction to the cost of the fadd reduction.
6954   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6955     BaseCost +=
6956         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6957 
6958   // If we're using ordered reductions then we can just return the base cost
6959   // here, since getArithmeticReductionCost calculates the full ordered
6960   // reduction cost when FP reassociation is not allowed.
6961   if (useOrderedReductions(RdxDesc))
6962     return BaseCost;
6963 
6964   // Get the operand that was not the reduction chain and match it to one of the
6965   // patterns, returning the better cost if it is found.
6966   Instruction *RedOp = RetI->getOperand(1) == LastChain
6967                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6968                            : dyn_cast<Instruction>(RetI->getOperand(1));
6969 
6970   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6971 
6972   Instruction *Op0, *Op1;
6973   if (RedOp &&
6974       match(RedOp,
6975             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6976       match(Op0, m_ZExtOrSExt(m_Value())) &&
6977       Op0->getOpcode() == Op1->getOpcode() &&
6978       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6979       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6980       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6981 
    // Matched reduce(ext(mul(ext(A), ext(B))))
    // Note that the extend opcodes need to all match, or if A==B they will
    // have been converted to zext(mul(sext(A), sext(A))) as it is known
    // positive, which is equally fine.
6986     bool IsUnsigned = isa<ZExtInst>(Op0);
6987     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6988     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6989 
6990     InstructionCost ExtCost =
6991         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6992                              TTI::CastContextHint::None, CostKind, Op0);
6993     InstructionCost MulCost =
6994         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6995     InstructionCost Ext2Cost =
6996         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6997                              TTI::CastContextHint::None, CostKind, RedOp);
6998 
6999     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7000         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7001         CostKind);
7002 
7003     if (RedCost.isValid() &&
7004         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
7005       return I == RetI ? RedCost : 0;
7006   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
7007              !TheLoop->isLoopInvariant(RedOp)) {
7008     // Matched reduce(ext(A))
7009     bool IsUnsigned = isa<ZExtInst>(RedOp);
7010     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
7011     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7012         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7013         CostKind);
7014 
7015     InstructionCost ExtCost =
7016         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
7017                              TTI::CastContextHint::None, CostKind, RedOp);
7018     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
7019       return I == RetI ? RedCost : 0;
7020   } else if (RedOp &&
7021              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
7022     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
7023         Op0->getOpcode() == Op1->getOpcode() &&
7024         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
7025       bool IsUnsigned = isa<ZExtInst>(Op0);
7026       Type *Op0Ty = Op0->getOperand(0)->getType();
7027       Type *Op1Ty = Op1->getOperand(0)->getType();
7028       Type *LargestOpTy =
7029           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
7030                                                                     : Op0Ty;
7031       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
7032 
7033       // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
7034       // different sizes. We take the largest type as the ext to reduce, and add
7035       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
7036       InstructionCost ExtCost0 = TTI.getCastInstrCost(
7037           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
7038           TTI::CastContextHint::None, CostKind, Op0);
7039       InstructionCost ExtCost1 = TTI.getCastInstrCost(
7040           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
7041           TTI::CastContextHint::None, CostKind, Op1);
7042       InstructionCost MulCost =
7043           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7044 
7045       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7046           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7047           CostKind);
7048       InstructionCost ExtraExtCost = 0;
7049       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
7050         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
7051         ExtraExtCost = TTI.getCastInstrCost(
7052             ExtraExtOp->getOpcode(), ExtType,
7053             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
7054             TTI::CastContextHint::None, CostKind, ExtraExtOp);
7055       }
7056 
7057       if (RedCost.isValid() &&
7058           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
7059         return I == RetI ? RedCost : 0;
7060     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
7061       // Matched reduce(mul())
7062       InstructionCost MulCost =
7063           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7064 
7065       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7066           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
7067           CostKind);
7068 
7069       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
7070         return I == RetI ? RedCost : 0;
7071     }
7072   }
7073 
7074   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
7075 }
7076 
7077 InstructionCost
7078 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7079                                                      ElementCount VF) {
7080   // Calculate scalar cost only. Vectorization cost should be ready at this
7081   // moment.
7082   if (VF.isScalar()) {
7083     Type *ValTy = getLoadStoreType(I);
7084     const Align Alignment = getLoadStoreAlignment(I);
7085     unsigned AS = getLoadStoreAddressSpace(I);
7086 
7087     return TTI.getAddressComputationCost(ValTy) +
7088            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7089                                TTI::TCK_RecipThroughput, I);
7090   }
7091   return getWideningCost(I, VF);
7092 }
7093 
7094 LoopVectorizationCostModel::VectorizationCostTy
7095 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7096                                                ElementCount VF) {
7097   // If we know that this instruction will remain uniform, check the cost of
7098   // the scalar version.
7099   if (isUniformAfterVectorization(I, VF))
7100     VF = ElementCount::getFixed(1);
7101 
7102   if (VF.isVector() && isProfitableToScalarize(I, VF))
7103     return VectorizationCostTy(InstsToScalarize[VF][I], false);
7104 
7105   // Forced scalars do not have any scalarization overhead.
7106   auto ForcedScalar = ForcedScalars.find(VF);
7107   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
7108     auto InstSet = ForcedScalar->second;
7109     if (InstSet.count(I))
7110       return VectorizationCostTy(
7111           (getInstructionCost(I, ElementCount::getFixed(1)).first *
7112            VF.getKnownMinValue()),
7113           false);
7114   }
7115 
7116   Type *VectorTy;
7117   InstructionCost C = getInstructionCost(I, VF, VectorTy);
7118 
7119   bool TypeNotScalarized = false;
7120   if (VF.isVector() && VectorTy->isVectorTy()) {
7121     unsigned NumParts = TTI.getNumberOfParts(VectorTy);
7122     if (NumParts)
7123       TypeNotScalarized = NumParts < VF.getKnownMinValue();
7124     else
7125       C = InstructionCost::getInvalid();
7126   }
7127   return VectorizationCostTy(C, TypeNotScalarized);
7128 }
7129 
7130 InstructionCost
7131 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
7132                                                      ElementCount VF) const {
7133 
7134   // There is no mechanism yet to create a scalable scalarization loop,
7135   // so this is currently Invalid.
7136   if (VF.isScalable())
7137     return InstructionCost::getInvalid();
7138 
7139   if (VF.isScalar())
7140     return 0;
7141 
7142   InstructionCost Cost = 0;
7143   Type *RetTy = ToVectorTy(I->getType(), VF);
7144   if (!RetTy->isVoidTy() &&
7145       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
7146     Cost += TTI.getScalarizationOverhead(
7147         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
7148         false);
7149 
7150   // Some targets keep addresses scalar.
7151   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
7152     return Cost;
7153 
7154   // Some targets support efficient element stores.
7155   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
7156     return Cost;
7157 
7158   // Collect operands to consider.
7159   CallInst *CI = dyn_cast<CallInst>(I);
7160   Instruction::op_range Ops = CI ? CI->args() : I->operands();
7161 
7162   // Skip operands that do not require extraction/scalarization and do not incur
7163   // any overhead.
7164   SmallVector<Type *> Tys;
7165   for (auto *V : filterExtractingOperands(Ops, VF))
7166     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7167   return Cost + TTI.getOperandsScalarizationOverhead(
7168                     filterExtractingOperands(Ops, VF), Tys);
7169 }
7170 
7171 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
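  // Memory widening decisions are only needed for vector VFs; scalar memory
  // operations are costed directly in getMemoryInstructionCost.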
7172   if (VF.isScalar())
7173     return;
7174   NumPredStores = 0;
7175   for (BasicBlock *BB : TheLoop->blocks()) {
7176     // For each instruction in the old loop.
7177     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
7179       if (!Ptr)
7180         continue;
7181 
7182       // TODO: We should generate better code and update the cost model for
7183       // predicated uniform stores. Today they are treated as any other
7184       // predicated store (see added test cases in
7185       // invariant-store-vectorization.ll).
7186       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
7187         NumPredStores++;
7188 
7189       if (Legal->isUniformMemOp(I)) {
7190         // TODO: Avoid replicating loads and stores instead of
7191         // relying on instcombine to remove them.
7192         // Load: Scalar load + broadcast
7193         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
7194         InstructionCost Cost;
7195         if (isa<StoreInst>(&I) && VF.isScalable() &&
7196             isLegalGatherOrScatter(&I)) {
7197           Cost = getGatherScatterCost(&I, VF);
7198           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
7199         } else {
7200           assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
7201                  "Cannot yet scalarize uniform stores");
7202           Cost = getUniformMemOpCost(&I, VF);
7203           setWideningDecision(&I, VF, CM_Scalarize, Cost);
7204         }
7205         continue;
7206       }
7207 
7208       // We assume that widening is the best solution when possible.
7209       if (memoryInstructionCanBeWidened(&I, VF)) {
7210         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7211         int ConsecutiveStride = Legal->isConsecutivePtr(
7212             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
7213         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7214                "Expected consecutive stride.");
7215         InstWidening Decision =
7216             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7217         setWideningDecision(&I, VF, Decision, Cost);
7218         continue;
7219       }
7220 
7221       // Choose between Interleaving, Gather/Scatter or Scalarization.
7222       InstructionCost InterleaveCost = InstructionCost::getInvalid();
7223       unsigned NumAccesses = 1;
7224       if (isAccessInterleaved(&I)) {
7225         auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Failed to get an interleaved access group.");
7227 
7228         // Make one decision for the whole group.
7229         if (getWideningDecision(&I, VF) != CM_Unknown)
7230           continue;
7231 
7232         NumAccesses = Group->getNumMembers();
7233         if (interleavedAccessCanBeWidened(&I, VF))
7234           InterleaveCost = getInterleaveGroupCost(&I, VF);
7235       }
7236 
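      // The interleave cost already covers the whole group, whereas the
      // gather/scatter and scalarization costs below are per access, so scale
      // them by the number of group members to keep the comparison fair.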
7237       InstructionCost GatherScatterCost =
7238           isLegalGatherOrScatter(&I)
7239               ? getGatherScatterCost(&I, VF) * NumAccesses
7240               : InstructionCost::getInvalid();
7241 
7242       InstructionCost ScalarizationCost =
7243           getMemInstScalarizationCost(&I, VF) * NumAccesses;
7244 
      // Choose the best option for the current VF, record the decision, and
      // use it during vectorization.
7247       InstructionCost Cost;
7248       InstWidening Decision;
7249       if (InterleaveCost <= GatherScatterCost &&
7250           InterleaveCost < ScalarizationCost) {
7251         Decision = CM_Interleave;
7252         Cost = InterleaveCost;
7253       } else if (GatherScatterCost < ScalarizationCost) {
7254         Decision = CM_GatherScatter;
7255         Cost = GatherScatterCost;
7256       } else {
7257         Decision = CM_Scalarize;
7258         Cost = ScalarizationCost;
7259       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The cost is attributed to the group as a
      // whole, but it will actually be assigned to a single instruction.
7263       if (auto Group = getInterleavedAccessGroup(&I))
7264         setWideningDecision(Group, VF, Decision, Cost);
7265       else
7266         setWideningDecision(&I, VF, Decision, Cost);
7267     }
7268   }
7269 
7270   // Make sure that any load of address and any other address computation
7271   // remains scalar unless there is gather/scatter support. This avoids
7272   // inevitable extracts into address registers, and also has the benefit of
7273   // activating LSR more, since that pass can't optimize vectorized
7274   // addresses.
7275   if (TTI.prefersVectorizedAddressing())
7276     return;
7277 
7278   // Start with all scalar pointer uses.
7279   SmallPtrSet<Instruction *, 8> AddrDefs;
7280   for (BasicBlock *BB : TheLoop->blocks())
7281     for (Instruction &I : *BB) {
7282       Instruction *PtrDef =
7283         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7284       if (PtrDef && TheLoop->contains(PtrDef) &&
7285           getWideningDecision(&I, VF) != CM_GatherScatter)
7286         AddrDefs.insert(PtrDef);
7287     }
7288 
7289   // Add all instructions used to generate the addresses.
7290   SmallVector<Instruction *, 4> Worklist;
7291   append_range(Worklist, AddrDefs);
7292   while (!Worklist.empty()) {
7293     Instruction *I = Worklist.pop_back_val();
7294     for (auto &Op : I->operands())
7295       if (auto *InstOp = dyn_cast<Instruction>(Op))
7296         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7297             AddrDefs.insert(InstOp).second)
7298           Worklist.push_back(InstOp);
7299   }
7300 
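  // Force the collected address computations to remain scalar: widened or
  // interleaved loads feeding addresses get a scalarized decision, and any
  // other instruction is added to ForcedScalars for this VF.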
7301   for (auto *I : AddrDefs) {
7302     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since that would require finding out whether
      // the loaded register is involved in an address computation, the
      // decision is instead changed here, where we know this is the case.
7307       InstWidening Decision = getWideningDecision(I, VF);
7308       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7309         // Scalarize a widened load of address.
7310         setWideningDecision(
7311             I, VF, CM_Scalarize,
7312             (VF.getKnownMinValue() *
7313              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7314       else if (auto Group = getInterleavedAccessGroup(I)) {
7315         // Scalarize an interleave group of address loads.
7316         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7317           if (Instruction *Member = Group->getMember(I))
7318             setWideningDecision(
7319                 Member, VF, CM_Scalarize,
7320                 (VF.getKnownMinValue() *
7321                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7322         }
7323       }
7324     } else
      // Make sure I gets scalarized and receives a cost estimate without
      // scalarization overhead.
7327       ForcedScalars[VF].insert(I);
7328   }
7329 }
7330 
7331 InstructionCost
7332 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7333                                                Type *&VectorTy) {
7334   Type *RetTy = I->getType();
7335   if (canTruncateToMinimalBitwidth(I, VF))
7336     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7337   auto SE = PSE.getSE();
7338   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7339 
7340   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7341                                                 ElementCount VF) -> bool {
7342     if (VF.isScalar())
7343       return true;
7344 
7345     auto Scalarized = InstsToScalarize.find(VF);
7346     assert(Scalarized != InstsToScalarize.end() &&
7347            "VF not yet analyzed for scalarization profitability");
7348     return !Scalarized->second.count(I) &&
7349            llvm::all_of(I->users(), [&](User *U) {
7350              auto *UI = cast<Instruction>(U);
7351              return !Scalarized->second.count(UI);
7352            });
7353   };
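  // The helper above is only referenced inside the assert below; the cast to
  // void keeps release (NDEBUG) builds free of unused-variable warnings.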
7354   (void) hasSingleCopyAfterVectorization;
7355 
7356   if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result, we
    // don't have to multiply the instruction cost by VF.
7362     assert(I->getOpcode() == Instruction::GetElementPtr ||
7363            I->getOpcode() == Instruction::PHI ||
7364            (I->getOpcode() == Instruction::BitCast &&
7365             I->getType()->isPointerTy()) ||
7366            hasSingleCopyAfterVectorization(I, VF));
7367     VectorTy = RetTy;
7368   } else
7369     VectorTy = ToVectorTy(RetTy, VF);
7370 
7371   // TODO: We need to estimate the cost of intrinsic calls.
7372   switch (I->getOpcode()) {
7373   case Instruction::GetElementPtr:
7374     // We mark this instruction as zero-cost because the cost of GEPs in
7375     // vectorized code depends on whether the corresponding memory instruction
7376     // is scalarized or not. Therefore, we handle GEPs with the memory
7377     // instruction cost.
7378     return 0;
7379   case Instruction::Br: {
7380     // In cases of scalarized and predicated instructions, there will be VF
7381     // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
7383     bool ScalarPredicatedBB = false;
7384     BranchInst *BI = cast<BranchInst>(I);
7385     if (VF.isVector() && BI->isConditional() &&
7386         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7387          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7388       ScalarPredicatedBB = true;
7389 
7390     if (ScalarPredicatedBB) {
      // It is not possible to scalarize a scalable vector with predicated
      // instructions.
7392       if (VF.isScalable())
7393         return InstructionCost::getInvalid();
7394       // Return cost for branches around scalarized and predicated blocks.
7395       auto *Vec_i1Ty =
7396           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7397       return (
7398           TTI.getScalarizationOverhead(
7399               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7400           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7401     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7402       // The back-edge branch will remain, as will all scalar branches.
7403       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7404     else
7405       // This branch will be eliminated by if-conversion.
7406       return 0;
7407     // Note: We currently assume zero cost for an unconditional branch inside
7408     // a predicated block since it will become a fall-through, although we
7409     // may decide in the future to call TTI for all branches.
7410   }
7411   case Instruction::PHI: {
7412     auto *Phi = cast<PHINode>(I);
7413 
7414     // First-order recurrences are replaced by vector shuffles inside the loop.
7415     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7416     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7417       return TTI.getShuffleCost(
7418           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7419           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7420 
7421     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7422     // converted into select instructions. We require N - 1 selects per phi
7423     // node, where N is the number of incoming values.
7424     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7425       return (Phi->getNumIncomingValues() - 1) *
7426              TTI.getCmpSelInstrCost(
7427                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7428                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7429                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7430 
7431     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7432   }
7433   case Instruction::UDiv:
7434   case Instruction::SDiv:
7435   case Instruction::URem:
7436   case Instruction::SRem:
7437     // If we have a predicated instruction, it may not be executed for each
7438     // vector lane. Get the scalarization cost and scale this amount by the
7439     // probability of executing the predicated block. If the instruction is not
7440     // predicated, we fall through to the next case.
7441     if (VF.isVector() && isScalarWithPredication(I)) {
7442       InstructionCost Cost = 0;
7443 
7444       // These instructions have a non-void type, so account for the phi nodes
7445       // that we will create. This cost is likely to be zero. The phi node
7446       // cost, if any, should be scaled by the block probability because it
7447       // models a copy at the end of each predicated block.
7448       Cost += VF.getKnownMinValue() *
7449               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7450 
7451       // The cost of the non-predicated instruction.
7452       Cost += VF.getKnownMinValue() *
7453               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7454 
7455       // The cost of insertelement and extractelement instructions needed for
7456       // scalarization.
7457       Cost += getScalarizationOverhead(I, VF);
7458 
7459       // Scale the cost by the probability of executing the predicated blocks.
7460       // This assumes the predicated block for each vector lane is equally
7461       // likely.
7462       return Cost / getReciprocalPredBlockProb();
7463     }
7464     LLVM_FALLTHROUGH;
7465   case Instruction::Add:
7466   case Instruction::FAdd:
7467   case Instruction::Sub:
7468   case Instruction::FSub:
7469   case Instruction::Mul:
7470   case Instruction::FMul:
7471   case Instruction::FDiv:
7472   case Instruction::FRem:
7473   case Instruction::Shl:
7474   case Instruction::LShr:
7475   case Instruction::AShr:
7476   case Instruction::And:
7477   case Instruction::Or:
7478   case Instruction::Xor: {
7479     // Since we will replace the stride by 1 the multiplication should go away.
7480     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7481       return 0;
7482 
7483     // Detect reduction patterns
7484     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7485       return *RedCost;
7486 
7487     // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7489     Value *Op2 = I->getOperand(1);
7490     TargetTransformInfo::OperandValueProperties Op2VP;
7491     TargetTransformInfo::OperandValueKind Op2VK =
7492         TTI.getOperandInfo(Op2, Op2VP);
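    // An operand that is uniform across loop iterations takes the same value
    // in every vector lane, so it can be costed as a uniform value even when
    // its exact value is unknown.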
7493     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7494       Op2VK = TargetTransformInfo::OK_UniformValue;
7495 
7496     SmallVector<const Value *, 4> Operands(I->operand_values());
7497     return TTI.getArithmeticInstrCost(
7498         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7499         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7500   }
7501   case Instruction::FNeg: {
7502     return TTI.getArithmeticInstrCost(
7503         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7504         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7505         TargetTransformInfo::OP_None, I->getOperand(0), I);
7506   }
7507   case Instruction::Select: {
7508     SelectInst *SI = cast<SelectInst>(I);
7509     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7510     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7511 
7512     const Value *Op0, *Op1;
7513     using namespace llvm::PatternMatch;
7514     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7515                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7516       // select x, y, false --> x & y
7517       // select x, true, y --> x | y
7518       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7519       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7520       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7521       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7522       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
             Op1->getType()->getScalarSizeInBits() == 1);
7524 
7525       SmallVector<const Value *, 2> Operands{Op0, Op1};
7526       return TTI.getArithmeticInstrCost(
7527           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7528           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7529     }
7530 
7531     Type *CondTy = SI->getCondition()->getType();
7532     if (!ScalarCond)
7533       CondTy = VectorType::get(CondTy, VF);
7534 
7535     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7536     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7537       Pred = Cmp->getPredicate();
7538     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7539                                   CostKind, I);
7540   }
7541   case Instruction::ICmp:
7542   case Instruction::FCmp: {
7543     Type *ValTy = I->getOperand(0)->getType();
7544     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7545     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7546       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7547     VectorTy = ToVectorTy(ValTy, VF);
7548     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7549                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7550                                   I);
7551   }
7552   case Instruction::Store:
7553   case Instruction::Load: {
7554     ElementCount Width = VF;
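    // If the cost model decided to scalarize this access, the per-lane value
    // type is scalar, so narrow the width used to compute VectorTy below.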
7555     if (Width.isVector()) {
7556       InstWidening Decision = getWideningDecision(I, Width);
7557       assert(Decision != CM_Unknown &&
7558              "CM decision should be taken at this point");
7559       if (Decision == CM_Scalarize)
7560         Width = ElementCount::getFixed(1);
7561     }
7562     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7563     return getMemoryInstructionCost(I, VF);
7564   }
7565   case Instruction::BitCast:
7566     if (I->getType()->isPointerTy())
7567       return 0;
7568     LLVM_FALLTHROUGH;
7569   case Instruction::ZExt:
7570   case Instruction::SExt:
7571   case Instruction::FPToUI:
7572   case Instruction::FPToSI:
7573   case Instruction::FPExt:
7574   case Instruction::PtrToInt:
7575   case Instruction::IntToPtr:
7576   case Instruction::SIToFP:
7577   case Instruction::UIToFP:
7578   case Instruction::Trunc:
7579   case Instruction::FPTrunc: {
7580     // Computes the CastContextHint from a Load/Store instruction.
7581     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7582       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7583              "Expected a load or a store!");
7584 
7585       if (VF.isScalar() || !TheLoop->contains(I))
7586         return TTI::CastContextHint::Normal;
7587 
7588       switch (getWideningDecision(I, VF)) {
7589       case LoopVectorizationCostModel::CM_GatherScatter:
7590         return TTI::CastContextHint::GatherScatter;
7591       case LoopVectorizationCostModel::CM_Interleave:
7592         return TTI::CastContextHint::Interleave;
7593       case LoopVectorizationCostModel::CM_Scalarize:
7594       case LoopVectorizationCostModel::CM_Widen:
7595         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7596                                         : TTI::CastContextHint::Normal;
7597       case LoopVectorizationCostModel::CM_Widen_Reverse:
7598         return TTI::CastContextHint::Reversed;
7599       case LoopVectorizationCostModel::CM_Unknown:
7600         llvm_unreachable("Instr did not go through cost modelling?");
7601       }
7602 
7603       llvm_unreachable("Unhandled case!");
7604     };
7605 
7606     unsigned Opcode = I->getOpcode();
7607     TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc/FPTrunc, the context is its single user, provided that user
    // is a StoreInst.
7609     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7610       if (I->hasOneUse())
7611         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7612           CCH = ComputeCCH(Store);
7613     }
    // For ZExt/SExt/FPExt, the context is the operand, provided it is a load.
7615     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7616              Opcode == Instruction::FPExt) {
7617       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7618         CCH = ComputeCCH(Load);
7619     }
7620 
7621     // We optimize the truncation of induction variables having constant
7622     // integer steps. The cost of these truncations is the same as the scalar
7623     // operation.
7624     if (isOptimizableIVTruncate(I, VF)) {
7625       auto *Trunc = cast<TruncInst>(I);
7626       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7627                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7628     }
7629 
7630     // Detect reduction patterns
7631     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7632       return *RedCost;
7633 
7634     Type *SrcScalarTy = I->getOperand(0)->getType();
7635     Type *SrcVecTy =
7636         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7637     if (canTruncateToMinimalBitwidth(I, VF)) {
7638       // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
7640       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7641       //
7642       // Calculate the modified src and dest types.
7643       Type *MinVecTy = VectorTy;
7644       if (Opcode == Instruction::Trunc) {
7645         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7646         VectorTy =
7647             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7648       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7649         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7650         VectorTy =
7651             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7652       }
7653     }
7654 
7655     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7656   }
7657   case Instruction::Call: {
7658     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7659       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7660         return *RedCost;
7661     bool NeedToScalarize;
7662     CallInst *CI = cast<CallInst>(I);
7663     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
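    // A call that also maps to a vector intrinsic can be lowered either as a
    // (possibly scalarized) call or as the intrinsic; return the cheaper one.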
7664     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7665       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7666       return std::min(CallCost, IntrinsicCost);
7667     }
7668     return CallCost;
7669   }
7670   case Instruction::ExtractValue:
7671     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7672   case Instruction::Alloca:
7673     // We cannot easily widen alloca to a scalable alloca, as
7674     // the result would need to be a vector of pointers.
7675     if (VF.isScalable())
7676       return InstructionCost::getInvalid();
7677     LLVM_FALLTHROUGH;
7678   default:
7679     // This opcode is unknown. Assume that it is the same as 'mul'.
7680     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7681   } // end of switch.
7682 }
7683 
7684 char LoopVectorize::ID = 0;
7685 
7686 static const char lv_name[] = "Loop Vectorization";
7687 
7688 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7689 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7690 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7691 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7692 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7693 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7694 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7695 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7696 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7697 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7698 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7699 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7700 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7701 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7702 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7703 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7704 
7705 namespace llvm {
7706 
7707 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7708 
7709 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7710                               bool VectorizeOnlyWhenForced) {
7711   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7712 }
7713 
7714 } // end namespace llvm
7715 
7716 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7717   // Check if the pointer operand of a load or store instruction is
7718   // consecutive.
7719   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7720     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7721   return false;
7722 }
7723 
7724 void LoopVectorizationCostModel::collectValuesToIgnore() {
7725   // Ignore ephemeral values.
7726   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7727 
7728   // Ignore type-promoting instructions we identified during reduction
7729   // detection.
7730   for (auto &Reduction : Legal->getReductionVars()) {
7731     const RecurrenceDescriptor &RedDes = Reduction.second;
7732     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7733     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7734   }
7735   // Ignore type-casting instructions we identified during induction
7736   // detection.
7737   for (auto &Induction : Legal->getInductionVars()) {
7738     const InductionDescriptor &IndDes = Induction.second;
7739     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7740     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7741   }
7742 }
7743 
7744 void LoopVectorizationCostModel::collectInLoopReductions() {
7745   for (auto &Reduction : Legal->getReductionVars()) {
7746     PHINode *Phi = Reduction.first;
7747     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7748 
7749     // We don't collect reductions that are type promoted (yet).
7750     if (RdxDesc.getRecurrenceType() != Phi->getType())
7751       continue;
7752 
7753     // If the target would prefer this reduction to happen "in-loop", then we
7754     // want to record it as such.
7755     unsigned Opcode = RdxDesc.getOpcode();
7756     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7757         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7758                                    TargetTransformInfo::ReductionFlags()))
7759       continue;
7760 
7761     // Check that we can correctly put the reductions into the loop, by
7762     // finding the chain of operations that leads from the phi to the loop
7763     // exit value.
7764     SmallVector<Instruction *, 4> ReductionOperations =
7765         RdxDesc.getReductionOpChain(Phi, TheLoop);
7766     bool InLoop = !ReductionOperations.empty();
7767     if (InLoop) {
7768       InLoopReductionChains[Phi] = ReductionOperations;
7769       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7770       Instruction *LastChain = Phi;
7771       for (auto *I : ReductionOperations) {
7772         InLoopReductionImmediateChains[I] = LastChain;
7773         LastChain = I;
7774       }
7775     }
7776     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7777                       << " reduction for phi: " << *Phi << "\n");
7778   }
7779 }
7780 
7781 // TODO: we could return a pair of values that specify the max VF and
7782 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do that yet because VPlan does not have a
// cost model that can choose which plan to execute when more than one is
// generated.
7786 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7787                                  LoopVectorizationCostModel &CM) {
7788   unsigned WidestType;
7789   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7790   return WidestVectorRegBits / WidestType;
7791 }
7792 
7793 VectorizationFactor
7794 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7795   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7796   ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
7799   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7800   // the vectorization pipeline.
7801   if (!OrigLoop->isInnermost()) {
7802     // If the user doesn't provide a vectorization factor, determine a
7803     // reasonable one.
7804     if (UserVF.isZero()) {
7805       VF = ElementCount::getFixed(determineVPlanVF(
7806           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7807               .getFixedSize(),
7808           CM));
7809       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7810 
7811       // Make sure we have a VF > 1 for stress testing.
7812       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7813         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7814                           << "overriding computed VF.\n");
7815         VF = ElementCount::getFixed(4);
7816       }
7817     }
7818     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7819     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7820            "VF needs to be a power of two");
7821     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7822                       << "VF " << VF << " to build VPlans.\n");
7823     buildVPlans(VF, VF);
7824 
7825     // For VPlan build stress testing, we bail out after VPlan construction.
7826     if (VPlanBuildStressTest)
7827       return VectorizationFactor::Disabled();
7828 
7829     return {VF, 0 /*Cost*/};
7830   }
7831 
7832   LLVM_DEBUG(
7833       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7834                 "VPlan-native path.\n");
7835   return VectorizationFactor::Disabled();
7836 }
7837 
7838 Optional<VectorizationFactor>
7839 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7840   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7841   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7843     return None;
7844 
  // Invalidate interleave groups if all blocks of the loop will be predicated.
7846   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7847       !useMaskedInterleavedAccesses(*TTI)) {
7848     LLVM_DEBUG(
7849         dbgs()
7850         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7851            "which requires masked-interleaved support.\n");
7852     if (CM.InterleaveInfo.invalidateGroups())
7853       // Invalidating interleave groups also requires invalidating all decisions
7854       // based on them, which includes widening decisions and uniform and scalar
7855       // values.
7856       CM.invalidateCostModelingDecisions();
7857   }
7858 
7859   ElementCount MaxUserVF =
7860       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7861   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7862   if (!UserVF.isZero() && UserVFIsLegal) {
7863     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7864            "VF needs to be a power of two");
7865     // Collect the instructions (and their associated costs) that will be more
7866     // profitable to scalarize.
7867     if (CM.selectUserVectorizationFactor(UserVF)) {
7868       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7869       CM.collectInLoopReductions();
7870       buildVPlansWithVPRecipes(UserVF, UserVF);
7871       LLVM_DEBUG(printPlans(dbgs()));
7872       return {{UserVF, 0}};
7873     } else
7874       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7875                               "InvalidCost", ORE, OrigLoop);
7876   }
7877 
7878   // Populate the set of Vectorization Factor Candidates.
7879   ElementCountSet VFCandidates;
7880   for (auto VF = ElementCount::getFixed(1);
7881        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7882     VFCandidates.insert(VF);
7883   for (auto VF = ElementCount::getScalable(1);
7884        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7885     VFCandidates.insert(VF);
7886 
7887   for (const auto &VF : VFCandidates) {
7888     // Collect Uniform and Scalar instructions after vectorization with VF.
7889     CM.collectUniformsAndScalars(VF);
7890 
7891     // Collect the instructions (and their associated costs) that will be more
7892     // profitable to scalarize.
7893     if (VF.isVector())
7894       CM.collectInstsToScalarize(VF);
7895   }
7896 
7897   CM.collectInLoopReductions();
7898   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7899   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7900 
7901   LLVM_DEBUG(printPlans(dbgs()));
7902   if (!MaxFactors.hasVector())
7903     return VectorizationFactor::Disabled();
7904 
7905   // Select the optimal vectorization factor.
7906   auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
7907 
7908   // Check if it is profitable to vectorize with runtime checks.
7909   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
7910   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
7911     bool PragmaThresholdReached =
7912         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
7913     bool ThresholdReached =
7914         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
7915     if ((ThresholdReached && !Hints.allowReordering()) ||
7916         PragmaThresholdReached) {
7917       ORE->emit([&]() {
7918         return OptimizationRemarkAnalysisAliasing(
7919                    DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
7920                    OrigLoop->getHeader())
7921                << "loop not vectorized: cannot prove it is safe to reorder "
7922                   "memory operations";
7923       });
7924       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
7925       Hints.emitRemarkWithHints();
7926       return VectorizationFactor::Disabled();
7927     }
7928   }
7929   return SelectedVF;
7930 }
7931 
7932 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7933   assert(count_if(VPlans,
7934                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7935              1 &&
         "Best VF does not have a single VPlan.");
7937 
7938   for (const VPlanPtr &Plan : VPlans) {
7939     if (Plan->hasVF(VF))
7940       return *Plan.get();
7941   }
7942   llvm_unreachable("No plan found!");
7943 }
7944 
7945 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7946                                            VPlan &BestVPlan,
7947                                            InnerLoopVectorizer &ILV,
7948                                            DominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
                    << ", UF=" << BestUF << '\n');
7951 
7952   // Perform the actual loop transformation.
7953 
7954   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7955   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7956   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7957   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7958   State.CanonicalIV = ILV.Induction;
7959   ILV.collectPoisonGeneratingRecipes(State);
7960 
7961   ILV.printDebugTracesAtStart();
7962 
7963   //===------------------------------------------------===//
7964   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost-model.
7968   //
7969   //===------------------------------------------------===//
7970 
7971   // 2. Copy and widen instructions from the old loop into the new loop.
7972   BestVPlan.execute(&State);
7973 
7974   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7975   //    predication, updating analyses.
7976   ILV.fixVectorizedLoop(State);
7977 
7978   ILV.printDebugTracesAtEnd();
7979 }
7980 
7981 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7982 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7983   for (const auto &Plan : VPlans)
7984     if (PrintVPlansInDotFormat)
7985       Plan->printDOT(O);
7986     else
7987       Plan->print(O);
7988 }
7989 #endif
7990 
7991 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7992     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7993 
  // We create new control flow for the vectorized loop, so an original exit
  // condition will be dead after vectorization if it is only used by its
  // block's terminator.
7997   SmallVector<BasicBlock*> ExitingBlocks;
7998   OrigLoop->getExitingBlocks(ExitingBlocks);
7999   for (auto *BB : ExitingBlocks) {
8000     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
8001     if (!Cmp || !Cmp->hasOneUse())
8002       continue;
8003 
8004     // TODO: we should introduce a getUniqueExitingBlocks on Loop
8005     if (!DeadInstructions.insert(Cmp).second)
8006       continue;
8007 
    // An operand of the icmp is often a dead trunc, used by IndUpdate.
8009     // TODO: can recurse through operands in general
8010     for (Value *Op : Cmp->operands()) {
8011       if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
8013     }
8014   }
8015 
8016   // We create new "steps" for induction variable updates to which the original
8017   // induction variables map. An original update instruction will be dead if
8018   // all its users except the induction variable are dead.
8019   auto *Latch = OrigLoop->getLoopLatch();
8020   for (auto &Induction : Legal->getInductionVars()) {
8021     PHINode *Ind = Induction.first;
8022     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8023 
8024     // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
8026     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
8027       continue;
8028 
8029     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
8030           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
8031         }))
8032       DeadInstructions.insert(IndUpdate);
8033   }
8034 }
8035 
8036 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
8037 
8038 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
8039 
8040 Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx,
8041                                         Value *Step,
8042                                         Instruction::BinaryOps BinOp) {
8043   // When unrolling and the VF is 1, we only need to add a simple scalar.
8044   Type *Ty = Val->getType();
8045   assert(!Ty->isVectorTy() && "Val must be a scalar");
8046 
8047   if (Ty->isFloatingPointTy()) {
8048     // Floating-point operations inherit FMF via the builder's flags.
8049     Value *MulOp = Builder.CreateFMul(StartIdx, Step);
8050     return Builder.CreateBinOp(BinOp, Val, MulOp);
8051   }
8052   return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction");
8053 }
8054 
8055 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
8056   SmallVector<Metadata *, 4> MDs;
8057   // Reserve first location for self reference to the LoopID metadata node.
8058   MDs.push_back(nullptr);
8059   bool IsUnrollMetadata = false;
8060   MDNode *LoopID = L->getLoopID();
8061   if (LoopID) {
8062     // First find existing loop unrolling disable metadata.
8063     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
8064       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
8065       if (MD) {
8066         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
8067         IsUnrollMetadata =
8068             S && S->getString().startswith("llvm.loop.unroll.disable");
8069       }
8070       MDs.push_back(LoopID->getOperand(i));
8071     }
8072   }
8073 
8074   if (!IsUnrollMetadata) {
8075     // Add runtime unroll disable metadata.
8076     LLVMContext &Context = L->getHeader()->getContext();
8077     SmallVector<Metadata *, 1> DisableOperands;
8078     DisableOperands.push_back(
8079         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
8080     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
8081     MDs.push_back(DisableNode);
8082     MDNode *NewLoopID = MDNode::get(Context, MDs);
8083     // Set operand 0 to refer to the loop id itself.
8084     NewLoopID->replaceOperandWith(0, NewLoopID);
8085     L->setLoopID(NewLoopID);
8086   }
8087 }
8088 
8089 //===--------------------------------------------------------------------===//
8090 // EpilogueVectorizerMainLoop
8091 //===--------------------------------------------------------------------===//
8092 
8093 /// This function is partially responsible for generating the control flow
8094 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8095 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
8096   MDNode *OrigLoopID = OrigLoop->getLoopID();
8097   Loop *Lp = createVectorLoopSkeleton("");
8098 
8099   // Generate the code to check the minimum iteration count of the vector
8100   // epilogue (see below).
8101   EPI.EpilogueIterationCountCheck =
8102       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
8103   EPI.EpilogueIterationCountCheck->setName("iter.check");
8104 
8105   // Generate the code to check any assumptions that we've made for SCEV
8106   // expressions.
8107   EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
8108 
8109   // Generate the code that checks at runtime if arrays overlap. We put the
8110   // checks into a separate block to make the more common case of few elements
8111   // faster.
8112   EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
8113 
8114   // Generate the iteration count check for the main loop, *after* the check
8115   // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer path length for
  // the main loop is compensated for by the gain from vectorizing the larger
8118   // trip count. Note: the branch will get updated later on when we vectorize
8119   // the epilogue.
8120   EPI.MainLoopIterationCountCheck =
8121       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
8122 
8123   // Generate the induction variable.
8124   OldInduction = Legal->getPrimaryInduction();
8125   Type *IdxTy = Legal->getWidestInductionType();
8126   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8127 
8128   IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt());
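  // The induction step is VF * UF; getRuntimeVF materializes it as a runtime
  // value so that scalable vectorization factors are handled as well.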
8129   Value *Step = getRuntimeVF(B, IdxTy, VF * UF);
8130   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8131   EPI.VectorTripCount = CountRoundDown;
8132   Induction =
8133       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8134                               getDebugLocFromInstOrOperands(OldInduction));
8135 
8136   // Skip induction resume value creation here because they will be created in
8137   // the second pass. If we created them here, they wouldn't be used anyway,
  // because the VPlan in the second pass still contains the inductions from the
8139   // original loop.
8140 
8141   return completeLoopSkeleton(Lp, OrigLoopID);
8142 }
8143 
8144 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
8145   LLVM_DEBUG({
8146     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
8147            << "Main Loop VF:" << EPI.MainLoopVF
8148            << ", Main Loop UF:" << EPI.MainLoopUF
8149            << ", Epilogue Loop VF:" << EPI.EpilogueVF
8150            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8151   });
8152 }
8153 
8154 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
8155   DEBUG_WITH_TYPE(VerboseDebug, {
8156     dbgs() << "intermediate fn:\n"
8157            << *OrigLoop->getHeader()->getParent() << "\n";
8158   });
8159 }
8160 
8161 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8162     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
8163   assert(L && "Expected valid Loop.");
8164   assert(Bypass && "Expected valid bypass basic block.");
8165   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
8166   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8167   Value *Count = getOrCreateTripCount(L);
8168   // Reuse existing vector loop preheader for TC checks.
8169   // Note that new preheader block is generated for vector loop.
8170   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8171   IRBuilder<> Builder(TCCheckBlock->getTerminator());
8172 
8173   // Generate code to check if the loop's trip count is less than VF * UF of the
8174   // main vector loop.
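  // When a scalar epilogue is required, compare with ULE so that at least one
  // iteration is always left over for the scalar remainder loop.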
8175   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
8176       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8177 
8178   Value *CheckMinIters = Builder.CreateICmp(
8179       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
8180       "min.iters.check");
8181 
8182   if (!ForEpilogue)
8183     TCCheckBlock->setName("vector.main.loop.iter.check");
8184 
8185   // Create new preheader for vector loop.
8186   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8187                                    DT, LI, nullptr, "vector.ph");
8188 
8189   if (ForEpilogue) {
8190     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8191                                  DT->getNode(Bypass)->getIDom()) &&
8192            "TC check is expected to dominate Bypass");
8193 
8194     // Update dominator for Bypass & LoopExit.
8195     DT->changeImmediateDominator(Bypass, TCCheckBlock);
8196     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8197       // For loops with multiple exits, there's no edge from the middle block
8198       // to exit blocks (as the epilogue must run) and thus no need to update
8199       // the immediate dominator of the exit blocks.
8200       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8201 
8202     LoopBypassBlocks.push_back(TCCheckBlock);
8203 
8204     // Save the trip count so we don't have to regenerate it in the
8205     // vec.epilog.iter.check. This is safe to do because the trip count
8206     // generated here dominates the vector epilog iter check.
8207     EPI.TripCount = Count;
8208   }
8209 
8210   ReplaceInstWithInst(
8211       TCCheckBlock->getTerminator(),
8212       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8213 
8214   return TCCheckBlock;
8215 }
8216 
8217 //===--------------------------------------------------------------------===//
8218 // EpilogueVectorizerEpilogueLoop
8219 //===--------------------------------------------------------------------===//
8220 
8221 /// This function is partially responsible for generating the control flow
8222 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8223 BasicBlock *
8224 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8225   MDNode *OrigLoopID = OrigLoop->getLoopID();
8226   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
8227 
  // Now, compare the remaining count and, if there aren't enough iterations to
  // execute the vectorized epilogue, skip to the scalar part.
8230   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
8231   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
8232   LoopVectorPreHeader =
8233       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
8234                  LI, nullptr, "vec.epilog.ph");
8235   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
8236                                           VecEpilogueIterationCountCheck);
8237 
8238   // Adjust the control flow taking the state info from the main loop
8239   // vectorization into account.
8240   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8241          "expected this to be saved from the previous pass.");
8242   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8243       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8244 
8245   DT->changeImmediateDominator(LoopVectorPreHeader,
8246                                EPI.MainLoopIterationCountCheck);
8247 
8248   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8249       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8250 
8251   if (EPI.SCEVSafetyCheck)
8252     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8253         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8254   if (EPI.MemSafetyCheck)
8255     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8256         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8257 
8258   DT->changeImmediateDominator(
8259       VecEpilogueIterationCountCheck,
8260       VecEpilogueIterationCountCheck->getSinglePredecessor());
8261 
8262   DT->changeImmediateDominator(LoopScalarPreHeader,
8263                                EPI.EpilogueIterationCountCheck);
8264   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8265     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
8267     // dominator of the exit blocks.
8268     DT->changeImmediateDominator(LoopExitBlock,
8269                                  EPI.EpilogueIterationCountCheck);
8270 
8271   // Keep track of bypass blocks, as they feed start values to the induction
8272   // phis in the scalar loop preheader.
8273   if (EPI.SCEVSafetyCheck)
8274     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8275   if (EPI.MemSafetyCheck)
8276     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8277   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8278 
8279   // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
8281   Type *IdxTy = Legal->getWidestInductionType();
8282   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8283                                          LoopVectorPreHeader->getFirstNonPHI());
8284   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8285   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8286                            EPI.MainLoopIterationCountCheck);
8287 
8288   // Generate the induction variable.
8289   OldInduction = Legal->getPrimaryInduction();
8290   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8291   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8292   Value *StartIdx = EPResumeVal;
8293   Induction =
8294       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8295                               getDebugLocFromInstOrOperands(OldInduction));
8296 
8297   // Generate induction resume values. These variables save the new starting
8298   // indexes for the scalar loop. They are used to test if there are any tail
8299   // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from the
  // trip count of the main vector loop, hence the AdditionalBypass argument.
8304   createInductionResumeValues(Lp, CountRoundDown,
8305                               {VecEpilogueIterationCountCheck,
8306                                EPI.VectorTripCount} /* AdditionalBypass */);
8307 
8308   AddRuntimeUnrollDisableMetaData(Lp);
8309   return completeLoopSkeleton(Lp, OrigLoopID);
8310 }
8311 
8312 BasicBlock *
8313 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8314     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8315 
8316   assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
8318   assert(
8319       (!isa<Instruction>(EPI.TripCount) ||
8320        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8321       "saved trip count does not dominate insertion point.");
8322   Value *TC = EPI.TripCount;
8323   IRBuilder<> Builder(Insert->getTerminator());
8324   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8325 
8326   // Generate code to check if the loop's trip count is less than VF * UF of the
8327   // vector epilogue loop.
8328   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8329       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8330 
8331   Value *CheckMinIters =
8332       Builder.CreateICmp(P, Count,
8333                          createStepForVF(Builder, Count->getType(),
8334                                          EPI.EpilogueVF, EPI.EpilogueUF),
8335                          "min.epilog.iters.check");
8336 
8337   ReplaceInstWithInst(
8338       Insert->getTerminator(),
8339       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8340 
8341   LoopBypassBlocks.push_back(Insert);
8342   return Insert;
8343 }
8344 
8345 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8346   LLVM_DEBUG({
8347     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8348            << "Epilogue Loop VF:" << EPI.EpilogueVF
8349            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8350   });
8351 }
8352 
8353 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8354   DEBUG_WITH_TYPE(VerboseDebug, {
8355     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8356   });
8357 }
8358 
8359 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8360     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8361   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8362   bool PredicateAtRangeStart = Predicate(Range.Start);
8363 
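  // Clamp Range.End at the first VF whose decision differs from Range.Start,
  // so that every VF remaining in the range shares the same decision.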
8364   for (ElementCount TmpVF = Range.Start * 2;
8365        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8366     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8367       Range.End = TmpVF;
8368       break;
8369     }
8370 
8371   return PredicateAtRangeStart;
8372 }
8373 
8374 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8375 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8376 /// of VF's starting at a given VF and extending it as much as possible. Each
8377 /// vectorization decision can potentially shorten this sub-range during
8378 /// buildVPlan().
8379 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8380                                            ElementCount MaxVF) {
8381   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8382   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8383     VFRange SubRange = {VF, MaxVFPlusOne};
8384     VPlans.push_back(buildVPlan(SubRange));
8385     VF = SubRange.End;
8386   }
8387 }
8388 
8389 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8390                                          VPlanPtr &Plan) {
8391   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8392 
8393   // Look for cached value.
8394   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8395   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8396   if (ECEntryIt != EdgeMaskCache.end())
8397     return ECEntryIt->second;
8398 
8399   VPValue *SrcMask = createBlockInMask(Src, Plan);
8400 
8401   // The terminator has to be a branch inst!
8402   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8403   assert(BI && "Unexpected terminator found");
8404 
8405   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8406     return EdgeMaskCache[Edge] = SrcMask;
8407 
8408   // If source is an exiting block, we know the exit edge is dynamically dead
8409   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8410   // adding uses of an otherwise potentially dead instruction.
8411   if (OrigLoop->isLoopExiting(Src))
8412     return EdgeMaskCache[Edge] = SrcMask;
8413 
8414   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8415   assert(EdgeMask && "No Edge Mask found for condition");
8416 
8417   if (BI->getSuccessor(0) != Dst)
8418     EdgeMask = Builder.createNot(EdgeMask);
8419 
8420   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8421     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8422     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8423     // The select version does not introduce new UB if SrcMask is false and
8424     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
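    // For example, with SrcMask == false and EdgeMask == poison:
    //   select i1 false, i1 poison, i1 false  ==> false
    //   and    i1 false, i1 poison            ==> poison
    // so the select form keeps masked-off lanes well-defined.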
8425     VPValue *False = Plan->getOrAddVPValue(
8426         ConstantInt::getFalse(BI->getCondition()->getType()));
8427     EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
8428   }
8429 
8430   return EdgeMaskCache[Edge] = EdgeMask;
8431 }
8432 
8433 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8434   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8435 
8436   // Look for cached value.
8437   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8438   if (BCEntryIt != BlockMaskCache.end())
8439     return BCEntryIt->second;
8440 
8441   // All-one mask is modelled as no-mask following the convention for masked
8442   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8443   VPValue *BlockMask = nullptr;
8444 
8445   if (OrigLoop->getHeader() == BB) {
8446     if (!CM.blockNeedsPredicationForAnyReason(BB))
8447       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8448 
8449     // Create the block in mask as the first non-phi instruction in the block.
8450     VPBuilder::InsertPointGuard Guard(Builder);
8451     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
8452     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
8453 
8454     // Introduce the early-exit compare IV <= BTC to form header block mask.
8455     // This is used instead of IV < TC because TC may wrap, unlike BTC.
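    // For example, if the trip count is 2^N for an N-bit IV, TC wraps to 0 and
    // 'IV < TC' would be false for every lane, whereas BTC = 2^N - 1 is
    // representable and 'IV <= BTC' holds for all iterations.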
8456     // Start by constructing the desired canonical IV.
8457     VPValue *IV = nullptr;
8458     if (Legal->getPrimaryInduction())
8459       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
8460     else {
8461       auto *IVRecipe = new VPWidenCanonicalIVRecipe();
8462       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
8463       IV = IVRecipe;
8464     }
8465     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8466     bool TailFolded = !CM.isScalarEpilogueAllowed();
8467 
8468     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as a second argument, we only pass the IV here and extract the
      // tripcount from the transform state where codegen of the VP
      // instructions happens.
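      // Conceptually this computes get.active.lane.mask(IV, TC), i.e. lane L
      // is active iff IV + L < TC (a sketch; the actual lowering happens when
      // the VPInstruction is executed).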
8473       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
8474     } else {
8475       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8476     }
8477     return BlockMaskCache[BB] = BlockMask;
8478   }
8479 
8480   // This is the block mask. We OR all incoming edges.
8481   for (auto *Predecessor : predecessors(BB)) {
8482     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8483     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8484       return BlockMaskCache[BB] = EdgeMask;
8485 
    if (!BlockMask) { // BlockMask still has its initial nullptr value.
8487       BlockMask = EdgeMask;
8488       continue;
8489     }
8490 
8491     BlockMask = Builder.createOr(BlockMask, EdgeMask);
8492   }
8493 
8494   return BlockMaskCache[BB] = BlockMask;
8495 }
8496 
8497 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8498                                                 ArrayRef<VPValue *> Operands,
8499                                                 VFRange &Range,
8500                                                 VPlanPtr &Plan) {
8501   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8502          "Must be called with either a load or store");
8503 
8504   auto willWiden = [&](ElementCount VF) -> bool {
8505     if (VF.isScalar())
8506       return false;
8507     LoopVectorizationCostModel::InstWidening Decision =
8508         CM.getWideningDecision(I, VF);
8509     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8510            "CM decision should be taken at this point.");
8511     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8512       return true;
8513     if (CM.isScalarAfterVectorization(I, VF) ||
8514         CM.isProfitableToScalarize(I, VF))
8515       return false;
8516     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8517   };
8518 
8519   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8520     return nullptr;
8521 
8522   VPValue *Mask = nullptr;
8523   if (Legal->isMaskRequired(I))
8524     Mask = createBlockInMask(I->getParent(), Plan);
8525 
8526   // Determine if the pointer operand of the access is either consecutive or
8527   // reverse consecutive.
8528   LoopVectorizationCostModel::InstWidening Decision =
8529       CM.getWideningDecision(I, Range.Start);
8530   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8531   bool Consecutive =
8532       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8533 
8534   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8535     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8536                                               Consecutive, Reverse);
8537 
8538   StoreInst *Store = cast<StoreInst>(I);
8539   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8540                                             Mask, Consecutive, Reverse);
8541 }
8542 
8543 VPWidenIntOrFpInductionRecipe *
8544 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
8545                                            ArrayRef<VPValue *> Operands) const {
8546   // Check if this is an integer or fp induction. If so, build the recipe that
8547   // produces its scalar and vector values.
8548   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) {
8549     assert(II->getStartValue() ==
8550            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8551     return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II);
8552   }
8553 
8554   return nullptr;
8555 }
8556 
8557 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8558     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8559     VPlan &Plan) const {
8560   // Optimize the special case where the source is a constant integer
8561   // induction variable. Notice that we can only optimize the 'trunc' case
8562   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8563   // (c) other casts depend on pointer size.
8564 
8565   // Determine whether \p K is a truncation based on an induction variable that
8566   // can be optimized.
8567   auto isOptimizableIVTruncate =
8568       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8569     return [=](ElementCount VF) -> bool {
8570       return CM.isOptimizableIVTruncate(K, VF);
8571     };
8572   };
8573 
8574   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8575           isOptimizableIVTruncate(I), Range)) {
8576 
8577     auto *Phi = cast<PHINode>(I->getOperand(0));
8578     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8579     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8580     return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I);
8581   }
8582   return nullptr;
8583 }
8584 
8585 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8586                                                 ArrayRef<VPValue *> Operands,
8587                                                 VPlanPtr &Plan) {
8588   // If all incoming values are equal, the incoming VPValue can be used directly
8589   // instead of creating a new VPBlendRecipe.
8590   VPValue *FirstIncoming = Operands[0];
8591   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8592         return FirstIncoming == Inc;
8593       })) {
8594     return Operands[0];
8595   }
8596 
8597   // We know that all PHIs in non-header blocks are converted into selects, so
8598   // we don't have to worry about the insertion order and we can just use the
8599   // builder. At this point we generate the predication tree. There may be
8600   // duplications since this is a simple recursive scan, but future
8601   // optimizations will clean it up.
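  // The resulting VPBlendRecipe takes (value, mask) pairs, one per incoming
  // edge, e.g. blend(V0, M0, V1, M1) for a phi with two predecessors (the
  // mask is only omitted in the single-predecessor case).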
8602   SmallVector<VPValue *, 2> OperandsWithMask;
8603   unsigned NumIncoming = Phi->getNumIncomingValues();
8604 
8605   for (unsigned In = 0; In < NumIncoming; In++) {
8606     VPValue *EdgeMask =
8607       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8608     assert((EdgeMask || NumIncoming == 1) &&
8609            "Multiple predecessors with one having a full mask");
8610     OperandsWithMask.push_back(Operands[In]);
8611     if (EdgeMask)
8612       OperandsWithMask.push_back(EdgeMask);
8613   }
8614   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8615 }
8616 
8617 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8618                                                    ArrayRef<VPValue *> Operands,
8619                                                    VFRange &Range) const {
8620 
8621   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8622       [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
8623       Range);
8624 
8625   if (IsPredicated)
8626     return nullptr;
8627 
8628   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8629   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8630              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8631              ID == Intrinsic::pseudoprobe ||
8632              ID == Intrinsic::experimental_noalias_scope_decl))
8633     return nullptr;
8634 
8635   auto willWiden = [&](ElementCount VF) -> bool {
8636     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag indicates whether we use an intrinsic or a plain call for the
    // vectorized version of the instruction, i.e. whether an intrinsic call
    // is more beneficial than a library call.
8641     bool NeedToScalarize = false;
8642     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8643     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8644     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8645     return UseVectorIntrinsic || !NeedToScalarize;
8646   };
8647 
8648   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8649     return nullptr;
8650 
8651   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8652   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8653 }
8654 
8655 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8656   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8657          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // The instruction should be widened, unless it is scalar after
  // vectorization, scalarization is profitable, or it is predicated.
8660   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8661     return CM.isScalarAfterVectorization(I, VF) ||
8662            CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
8663   };
8664   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8665                                                              Range);
8666 }
8667 
8668 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8669                                            ArrayRef<VPValue *> Operands) const {
8670   auto IsVectorizableOpcode = [](unsigned Opcode) {
8671     switch (Opcode) {
8672     case Instruction::Add:
8673     case Instruction::And:
8674     case Instruction::AShr:
8675     case Instruction::BitCast:
8676     case Instruction::FAdd:
8677     case Instruction::FCmp:
8678     case Instruction::FDiv:
8679     case Instruction::FMul:
8680     case Instruction::FNeg:
8681     case Instruction::FPExt:
8682     case Instruction::FPToSI:
8683     case Instruction::FPToUI:
8684     case Instruction::FPTrunc:
8685     case Instruction::FRem:
8686     case Instruction::FSub:
8687     case Instruction::ICmp:
8688     case Instruction::IntToPtr:
8689     case Instruction::LShr:
8690     case Instruction::Mul:
8691     case Instruction::Or:
8692     case Instruction::PtrToInt:
8693     case Instruction::SDiv:
8694     case Instruction::Select:
8695     case Instruction::SExt:
8696     case Instruction::Shl:
8697     case Instruction::SIToFP:
8698     case Instruction::SRem:
8699     case Instruction::Sub:
8700     case Instruction::Trunc:
8701     case Instruction::UDiv:
8702     case Instruction::UIToFP:
8703     case Instruction::URem:
8704     case Instruction::Xor:
8705     case Instruction::ZExt:
8706       return true;
8707     }
8708     return false;
8709   };
8710 
8711   if (!IsVectorizableOpcode(I->getOpcode()))
8712     return nullptr;
8713 
8714   // Success: widen this instruction.
8715   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8716 }
8717 
8718 void VPRecipeBuilder::fixHeaderPhis() {
8719   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8720   for (VPWidenPHIRecipe *R : PhisToFix) {
8721     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8722     VPRecipeBase *IncR =
8723         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8724     R->addOperand(IncR->getVPSingleValue());
8725   }
8726 }
8727 
8728 VPBasicBlock *VPRecipeBuilder::handleReplication(
8729     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8730     VPlanPtr &Plan) {
8731   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8732       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8733       Range);
8734 
8735   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8736       [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
8737       Range);
8738 
8739   // Even if the instruction is not marked as uniform, there are certain
8740   // intrinsic calls that can be effectively treated as such, so we check for
8741   // them here. Conservatively, we only do this for scalable vectors, since
8742   // for fixed-width VFs we can always fall back on full scalarization.
8743   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8744     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8745     case Intrinsic::assume:
8746     case Intrinsic::lifetime_start:
8747     case Intrinsic::lifetime_end:
8748       // For scalable vectors if one of the operands is variant then we still
8749       // want to mark as uniform, which will generate one instruction for just
8750       // the first lane of the vector. We can't scalarize the call in the same
8751       // way as for fixed-width vectors because we don't know how many lanes
8752       // there are.
8753       //
8754       // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic, generating the instruction for the
      //      first lane is still better than not generating any at all. For
      //      example, the input may be a splat across all lanes.
8758       //   2. For the lifetime start/end intrinsics the pointer operand only
8759       //      does anything useful when the input comes from a stack object,
8760       //      which suggests it should always be uniform. For non-stack objects
8761       //      the effect is to poison the object, which still allows us to
8762       //      remove the call.
8763       IsUniform = true;
8764       break;
8765     default:
8766       break;
8767     }
8768   }
8769 
8770   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8771                                        IsUniform, IsPredicated);
8772   setRecipe(I, Recipe);
8773   Plan->addVPValue(I, Recipe);
8774 
8775   // Find if I uses a predicated instruction. If so, it will use its scalar
8776   // value. Avoid hoisting the insert-element which packs the scalar value into
8777   // a vector value, as that happens iff all users use the vector value.
8778   for (VPValue *Op : Recipe->operands()) {
8779     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8780     if (!PredR)
8781       continue;
8782     auto *RepR =
8783         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8784     assert(RepR->isPredicated() &&
8785            "expected Replicate recipe to be predicated");
8786     RepR->setAlsoPack(false);
8787   }
8788 
8789   // Finalize the recipe for Instr, first if it is not predicated.
8790   if (!IsPredicated) {
8791     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8792     VPBB->appendRecipe(Recipe);
8793     return VPBB;
8794   }
8795   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8796   assert(VPBB->getSuccessors().empty() &&
8797          "VPBB has successors when handling predicated replication.");
8798   // Record predicated instructions for above packing optimizations.
8799   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8800   VPBlockUtils::insertBlockAfter(Region, VPBB);
8801   auto *RegSucc = new VPBasicBlock();
8802   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8803   return RegSucc;
8804 }
8805 
8806 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8807                                                       VPRecipeBase *PredRecipe,
8808                                                       VPlanPtr &Plan) {
8809   // Instructions marked for predication are replicated and placed under an
8810   // if-then construct to prevent side-effects.
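  // The region built below has the shape (names illustrative):
  //   pred.<opcode>.entry    (VPBranchOnMaskRecipe on the block-in mask)
  //     |--> pred.<opcode>.if       (the predicated replicate recipe)
  //     |--> pred.<opcode>.continue (VPPredInstPHIRecipe merging the result)
  // with the .if block also branching to .continue.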
8811 
8812   // Generate recipes to compute the block mask for this region.
8813   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8814 
8815   // Build the triangular if-then region.
8816   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8817   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8818   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8819   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8820   auto *PHIRecipe = Instr->getType()->isVoidTy()
8821                         ? nullptr
8822                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8823   if (PHIRecipe) {
8824     Plan->removeVPValueFor(Instr);
8825     Plan->addVPValue(Instr, PHIRecipe);
8826   }
8827   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8828   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8829   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8830 
8831   // Note: first set Entry as region entry and then connect successors starting
8832   // from it in order, to propagate the "parent" of each VPBasicBlock.
8833   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8834   VPBlockUtils::connectBlocks(Pred, Exit);
8835 
8836   return Region;
8837 }
8838 
8839 VPRecipeOrVPValueTy
8840 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8841                                         ArrayRef<VPValue *> Operands,
8842                                         VFRange &Range, VPlanPtr &Plan) {
8843   // First, check for specific widening recipes that deal with calls, memory
8844   // operations, inductions and Phi nodes.
8845   if (auto *CI = dyn_cast<CallInst>(Instr))
8846     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8847 
8848   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8849     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8850 
8851   VPRecipeBase *Recipe;
8852   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8853     if (Phi->getParent() != OrigLoop->getHeader())
8854       return tryToBlend(Phi, Operands, Plan);
8855     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
8856       return toVPRecipeResult(Recipe);
8857 
8858     VPWidenPHIRecipe *PhiRecipe = nullptr;
8859     if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
8860       VPValue *StartV = Operands[0];
8861       if (Legal->isReductionVariable(Phi)) {
8862         const RecurrenceDescriptor &RdxDesc =
8863             Legal->getReductionVars().find(Phi)->second;
8864         assert(RdxDesc.getRecurrenceStartValue() ==
8865                Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8866         PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8867                                              CM.isInLoopReduction(Phi),
8868                                              CM.useOrderedReductions(RdxDesc));
8869       } else {
8870         PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8871       }
8872 
8873       // Record the incoming value from the backedge, so we can add the incoming
8874       // value from the backedge after all recipes have been created.
8875       recordRecipeOf(cast<Instruction>(
8876           Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8877       PhisToFix.push_back(PhiRecipe);
8878     } else {
8879       // TODO: record start and backedge value for remaining pointer induction
8880       // phis.
8881       assert(Phi->getType()->isPointerTy() &&
8882              "only pointer phis should be handled here");
8883       PhiRecipe = new VPWidenPHIRecipe(Phi);
8884     }
8885 
8886     return toVPRecipeResult(PhiRecipe);
8887   }
8888 
8889   if (isa<TruncInst>(Instr) &&
8890       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8891                                                Range, *Plan)))
8892     return toVPRecipeResult(Recipe);
8893 
8894   if (!shouldWiden(Instr, Range))
8895     return nullptr;
8896 
8897   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8898     return toVPRecipeResult(new VPWidenGEPRecipe(
8899         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8900 
8901   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8902     bool InvariantCond =
8903         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8904     return toVPRecipeResult(new VPWidenSelectRecipe(
8905         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8906   }
8907 
8908   return toVPRecipeResult(tryToWiden(Instr, Operands));
8909 }
8910 
8911 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8912                                                         ElementCount MaxVF) {
8913   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8914 
8915   // Collect instructions from the original loop that will become trivially dead
8916   // in the vectorized loop. We don't need to vectorize these instructions. For
8917   // example, original induction update instructions can become dead because we
8918   // separately emit induction "steps" when generating code for the new loop.
8919   // Similarly, we create a new latch condition when setting up the structure
8920   // of the new loop, so the old one can become dead.
8921   SmallPtrSet<Instruction *, 4> DeadInstructions;
8922   collectTriviallyDeadInstructions(DeadInstructions);
8923 
8924   // Add assume instructions we need to drop to DeadInstructions, to prevent
8925   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
8927   // control flow is preserved, we should keep them.
8928   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8929   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8930 
8931   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8932   // Dead instructions do not need sinking. Remove them from SinkAfter.
8933   for (Instruction *I : DeadInstructions)
8934     SinkAfter.erase(I);
8935 
8936   // Cannot sink instructions after dead instructions (there won't be any
8937   // recipes for them). Instead, find the first non-dead previous instruction.
8938   for (auto &P : Legal->getSinkAfter()) {
8939     Instruction *SinkTarget = P.second;
8940     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8941     (void)FirstInst;
8942     while (DeadInstructions.contains(SinkTarget)) {
8943       assert(
8944           SinkTarget != FirstInst &&
8945           "Must find a live instruction (at least the one feeding the "
8946           "first-order recurrence PHI) before reaching beginning of the block");
8947       SinkTarget = SinkTarget->getPrevNode();
8948       assert(SinkTarget != P.first &&
8949              "sink source equals target, no sinking required");
8950     }
8951     P.second = SinkTarget;
8952   }
8953 
8954   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8955   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8956     VFRange SubRange = {VF, MaxVFPlusOne};
8957     VPlans.push_back(
8958         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8959     VF = SubRange.End;
8960   }
8961 }
8962 
8963 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8964     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8965     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8966 
8967   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8968 
8969   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8970 
8971   // ---------------------------------------------------------------------------
8972   // Pre-construction: record ingredients whose recipes we'll need to further
8973   // process after constructing the initial VPlan.
8974   // ---------------------------------------------------------------------------
8975 
8976   // Mark instructions we'll need to sink later and their targets as
8977   // ingredients whose recipe we'll need to record.
8978   for (auto &Entry : SinkAfter) {
8979     RecipeBuilder.recordRecipeOf(Entry.first);
8980     RecipeBuilder.recordRecipeOf(Entry.second);
8981   }
8982   for (auto &Reduction : CM.getInLoopReductionChains()) {
8983     PHINode *Phi = Reduction.first;
8984     RecurKind Kind =
8985         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8986     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8987 
8988     RecipeBuilder.recordRecipeOf(Phi);
8989     for (auto &R : ReductionOperations) {
8990       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
8992       // need to record the ICmp recipe, so it can be removed later.
8993       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8994              "Only min/max recurrences allowed for inloop reductions");
8995       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8996         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8997     }
8998   }
8999 
9000   // For each interleave group which is relevant for this (possibly trimmed)
9001   // Range, add it to the set of groups to be later applied to the VPlan and add
9002   // placeholders for its members' Recipes which we'll be replacing with a
9003   // single VPInterleaveRecipe.
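  // For example (illustrative), a factor-2 group of loads from A[2*i] and
  // A[2*i+1] is later emitted as one wide load of the group followed by
  // shuffles extracting each member, replacing the two per-member recipes.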
9004   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9005     auto applyIG = [IG, this](ElementCount VF) -> bool {
9006       return (VF.isVector() && // Query is illegal for VF == 1
9007               CM.getWideningDecision(IG->getInsertPos(), VF) ==
9008                   LoopVectorizationCostModel::CM_Interleave);
9009     };
9010     if (!getDecisionAndClampRange(applyIG, Range))
9011       continue;
9012     InterleaveGroups.insert(IG);
9013     for (unsigned i = 0; i < IG->getFactor(); i++)
9014       if (Instruction *Member = IG->getMember(i))
9015         RecipeBuilder.recordRecipeOf(Member);
9016   };
9017 
9018   // ---------------------------------------------------------------------------
9019   // Build initial VPlan: Scan the body of the loop in a topological order to
9020   // visit each basic block after having visited its predecessor basic blocks.
9021   // ---------------------------------------------------------------------------
9022 
9023   auto Plan = std::make_unique<VPlan>();
9024 
9025   // Scan the body of the loop in a topological order to visit each basic block
9026   // after having visited its predecessor basic blocks.
9027   LoopBlocksDFS DFS(OrigLoop);
9028   DFS.perform(LI);
9029 
9030   VPBasicBlock *VPBB = nullptr;
9031   VPBasicBlock *HeaderVPBB = nullptr;
9032   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
9033   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9034     // Relevant instructions from basic block BB will be grouped into VPRecipe
9035     // ingredients and fill a new VPBasicBlock.
9036     unsigned VPBBsForBB = 0;
9037     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
9038     if (VPBB)
9039       VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
9040     else {
9041       auto *TopRegion = new VPRegionBlock("vector loop");
9042       TopRegion->setEntry(FirstVPBBForBB);
9043       Plan->setEntry(TopRegion);
9044       HeaderVPBB = FirstVPBBForBB;
9045     }
9046     VPBB = FirstVPBBForBB;
9047     Builder.setInsertPoint(VPBB);
9048 
9049     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
9051     for (Instruction &I : BB->instructionsWithoutDebug()) {
9052       Instruction *Instr = &I;
9053 
9054       // First filter out irrelevant instructions, to ensure no recipes are
9055       // built for them.
9056       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9057         continue;
9058 
9059       SmallVector<VPValue *, 4> Operands;
9060       auto *Phi = dyn_cast<PHINode>(Instr);
9061       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9062         Operands.push_back(Plan->getOrAddVPValue(
9063             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9064       } else {
9065         auto OpRange = Plan->mapToVPValues(Instr->operands());
9066         Operands = {OpRange.begin(), OpRange.end()};
9067       }
9068       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9069               Instr, Operands, Range, Plan)) {
9070         // If Instr can be simplified to an existing VPValue, use it.
9071         if (RecipeOrValue.is<VPValue *>()) {
9072           auto *VPV = RecipeOrValue.get<VPValue *>();
9073           Plan->addVPValue(Instr, VPV);
9074           // If the re-used value is a recipe, register the recipe for the
9075           // instruction, in case the recipe for Instr needs to be recorded.
9076           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
9077             RecipeBuilder.setRecipe(Instr, R);
9078           continue;
9079         }
9080         // Otherwise, add the new recipe.
9081         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
9082         for (auto *Def : Recipe->definedValues()) {
9083           auto *UV = Def->getUnderlyingValue();
9084           Plan->addVPValue(UV, Def);
9085         }
9086 
9087         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
9088             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
9089           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
9090           // of the header block. That can happen for truncates of induction
9091           // variables. Those recipes are moved to the phi section of the header
9092           // block after applying SinkAfter, which relies on the original
9093           // position of the trunc.
9094           assert(isa<TruncInst>(Instr));
9095           InductionsToMove.push_back(
9096               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
9097         }
9098         RecipeBuilder.setRecipe(Instr, Recipe);
9099         VPBB->appendRecipe(Recipe);
9100         continue;
9101       }
9102 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
9105       VPBasicBlock *NextVPBB =
9106           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
9107       if (NextVPBB != VPBB) {
9108         VPBB = NextVPBB;
9109         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
9110                                     : "");
9111       }
9112     }
9113   }
9114 
9115   assert(isa<VPRegionBlock>(Plan->getEntry()) &&
9116          !Plan->getEntry()->getEntryBasicBlock()->empty() &&
9117          "entry block must be set to a VPRegionBlock having a non-empty entry "
9118          "VPBasicBlock");
9119   RecipeBuilder.fixHeaderPhis();
9120 
9121   // ---------------------------------------------------------------------------
9122   // Transform initial VPlan: Apply previously taken decisions, in order, to
9123   // bring the VPlan to its final state.
9124   // ---------------------------------------------------------------------------
9125 
9126   // Apply Sink-After legal constraints.
9127   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9128     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9129     if (Region && Region->isReplicator()) {
9130       assert(Region->getNumSuccessors() == 1 &&
9131              Region->getNumPredecessors() == 1 && "Expected SESE region!");
9132       assert(R->getParent()->size() == 1 &&
9133              "A recipe in an original replicator region must be the only "
9134              "recipe in its block");
9135       return Region;
9136     }
9137     return nullptr;
9138   };
9139   for (auto &Entry : SinkAfter) {
9140     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9141     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9142 
9143     auto *TargetRegion = GetReplicateRegion(Target);
9144     auto *SinkRegion = GetReplicateRegion(Sink);
9145     if (!SinkRegion) {
9146       // If the sink source is not a replicate region, sink the recipe directly.
9147       if (TargetRegion) {
9148         // The target is in a replication region, make sure to move Sink to
9149         // the block after it, not into the replication region itself.
9150         VPBasicBlock *NextBlock =
9151             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9152         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9153       } else
9154         Sink->moveAfter(Target);
9155       continue;
9156     }
9157 
9158     // The sink source is in a replicate region. Unhook the region from the CFG.
9159     auto *SinkPred = SinkRegion->getSinglePredecessor();
9160     auto *SinkSucc = SinkRegion->getSingleSuccessor();
9161     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9162     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9163     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9164 
9165     if (TargetRegion) {
9166       // The target recipe is also in a replicate region, move the sink region
9167       // after the target region.
9168       auto *TargetSucc = TargetRegion->getSingleSuccessor();
9169       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9170       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9171       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9172     } else {
      // The sink source is in a replicate region; we need to move the whole
      // replicate region, which should only contain a single recipe in the
      // main block.
9176       auto *SplitBlock =
9177           Target->getParent()->splitAt(std::next(Target->getIterator()));
9178 
9179       auto *SplitPred = SplitBlock->getSinglePredecessor();
9180 
9181       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9182       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9183       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9184       if (VPBB == SplitPred)
9185         VPBB = SplitBlock;
9186     }
9187   }
9188 
9189   cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB);
9190 
9191   VPlanTransforms::removeRedundantInductionCasts(*Plan);
9192 
9193   // Now that sink-after is done, move induction recipes for optimized truncates
9194   // to the phi section of the header block.
9195   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
9196     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9197 
9198   // Adjust the recipes for any inloop reductions.
9199   adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start);
9200 
9201   // Introduce a recipe to combine the incoming and previous values of a
9202   // first-order recurrence.
9203   for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9204     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9205     if (!RecurPhi)
9206       continue;
9207 
9208     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
9209     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9210     auto *Region = GetReplicateRegion(PrevRecipe);
9211     if (Region)
9212       InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
9213     if (Region || PrevRecipe->isPhi())
9214       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9215     else
9216       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9217 
9218     auto *RecurSplice = cast<VPInstruction>(
9219         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9220                              {RecurPhi, RecurPhi->getBackedgeValue()}));
9221 
9222     RecurPhi->replaceAllUsesWith(RecurSplice);
9223     // Set the first operand of RecurSplice to RecurPhi again, after replacing
9224     // all users.
9225     RecurSplice->setOperand(0, RecurPhi);
9226   }
9227 
9228   // Interleave memory: for each Interleave Group we marked earlier as relevant
9229   // for this VPlan, replace the Recipes widening its memory instructions with a
9230   // single VPInterleaveRecipe at its insertion point.
9231   for (auto IG : InterleaveGroups) {
9232     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9233         RecipeBuilder.getRecipe(IG->getInsertPos()));
9234     SmallVector<VPValue *, 4> StoredValues;
9235     for (unsigned i = 0; i < IG->getFactor(); ++i)
9236       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9237         auto *StoreR =
9238             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9239         StoredValues.push_back(StoreR->getStoredValue());
9240       }
9241 
9242     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9243                                         Recipe->getMask());
9244     VPIG->insertBefore(Recipe);
9245     unsigned J = 0;
9246     for (unsigned i = 0; i < IG->getFactor(); ++i)
9247       if (Instruction *Member = IG->getMember(i)) {
9248         if (!Member->getType()->isVoidTy()) {
9249           VPValue *OriginalV = Plan->getVPValue(Member);
9250           Plan->removeVPValueFor(Member);
9251           Plan->addVPValue(Member, VPIG->getVPValue(J));
9252           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9253           J++;
9254         }
9255         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9256       }
9257   }
9258 
  // From this point onwards, VPlan-to-VPlan transformations may change the plan
  // in ways that make accessing values using original IR values incorrect.
9261   Plan->disableValue2VPValue();
9262 
9263   VPlanTransforms::sinkScalarOperands(*Plan);
9264   VPlanTransforms::mergeReplicateRegions(*Plan);
9265 
9266   std::string PlanName;
9267   raw_string_ostream RSO(PlanName);
9268   ElementCount VF = Range.Start;
9269   Plan->addVF(VF);
9270   RSO << "Initial VPlan for VF={" << VF;
9271   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9272     Plan->addVF(VF);
9273     RSO << "," << VF;
9274   }
9275   RSO << "},UF>=1";
9276   RSO.flush();
9277   Plan->setName(PlanName);
9278 
9279   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9280   return Plan;
9281 }
9282 
9283 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
9288   assert(!OrigLoop->isInnermost());
9289   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9290 
9291   // Create new empty VPlan
9292   auto Plan = std::make_unique<VPlan>();
9293 
9294   // Build hierarchical CFG
9295   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9296   HCFGBuilder.buildHierarchicalCFG();
9297 
9298   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9299        VF *= 2)
9300     Plan->addVF(VF);
9301 
9302   if (EnableVPlanPredication) {
9303     VPlanPredicator VPP(*Plan);
9304     VPP.predicate();
9305 
    // Avoid running the transformation to recipes until masked code generation
    // in the VPlan-native path is in place.
9308     return Plan;
9309   }
9310 
9311   SmallPtrSet<Instruction *, 1> DeadInstructions;
9312   VPlanTransforms::VPInstructionsToVPRecipes(
9313       OrigLoop, Plan,
9314       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9315       DeadInstructions, *PSE.getSE());
9316   return Plan;
9317 }
9318 
9319 // Adjust the recipes for reductions. For in-loop reductions the chain of
9320 // instructions leading from the loop exit instr to the phi need to be converted
9321 // to reductions, with one operand being vector and the other being the scalar
9322 // reduction chain. For other reductions, a select is introduced between the phi
9323 // and live-out recipes when folding the tail.
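// For example (illustrative), an in-loop integer add reduction such as
// 'sum += a[i]' has its widened add replaced by a VPReductionRecipe whose
// operands are the scalar chain (sum), the widened load of a[i], and an
// optional mask when the tail is folded.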
9324 void LoopVectorizationPlanner::adjustRecipesForReductions(
9325     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9326     ElementCount MinVF) {
9327   for (auto &Reduction : CM.getInLoopReductionChains()) {
9328     PHINode *Phi = Reduction.first;
9329     const RecurrenceDescriptor &RdxDesc =
9330         Legal->getReductionVars().find(Phi)->second;
9331     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9332 
9333     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9334       continue;
9335 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
9339     // For minmax the chain will be the select instructions.
9340     Instruction *Chain = Phi;
9341     for (Instruction *R : ReductionOperations) {
9342       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9343       RecurKind Kind = RdxDesc.getRecurrenceKind();
9344 
9345       VPValue *ChainOp = Plan->getVPValue(Chain);
9346       unsigned FirstOpId;
9347       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9348              "Only min/max recurrences allowed for inloop reductions");
9349       // Recognize a call to the llvm.fmuladd intrinsic.
9350       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9351       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9352              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9353       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9354         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9355                "Expected to replace a VPWidenSelectSC");
9356         FirstOpId = 1;
9357       } else {
9358         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9359                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9360                "Expected to replace a VPWidenSC");
9361         FirstOpId = 0;
9362       }
9363       unsigned VecOpId =
9364           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9365       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9366 
9367       auto *CondOp = CM.foldTailByMasking()
9368                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9369                          : nullptr;
9370 
9371       if (IsFMulAdd) {
9372         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9373         // need to create an fmul recipe to use as the vector operand for the
9374         // fadd reduction.
9375         VPInstruction *FMulRecipe = new VPInstruction(
9376             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9377         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9378         WidenRecipe->getParent()->insert(FMulRecipe,
9379                                          WidenRecipe->getIterator());
9380         VecOp = FMulRecipe;
9381       }
9382       VPReductionRecipe *RedRecipe =
9383           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9384       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9385       Plan->removeVPValueFor(R);
9386       Plan->addVPValue(R, RedRecipe);
9387       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9389       WidenRecipe->eraseFromParent();
9390 
9391       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9392         VPRecipeBase *CompareRecipe =
9393             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9394         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9395                "Expected to replace a VPWidenSC");
9396         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9397                "Expected no remaining users");
9398         CompareRecipe->eraseFromParent();
9399       }
9400       Chain = R;
9401     }
9402   }
9403 
9404   // If tail is folded by masking, introduce selects between the phi
9405   // and the live-out instruction of each reduction, at the end of the latch.
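  // That is, the latch gets 'select(header-mask, reduced-value, phi)', so
  // lanes disabled by tail folding keep the phi's previous value and do not
  // contribute to the reduction result.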
9406   if (CM.foldTailByMasking()) {
9407     for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9408       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9409       if (!PhiR || PhiR->isInLoop())
9410         continue;
9411       Builder.setInsertPoint(LatchVPBB);
9412       VPValue *Cond =
9413           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9414       VPValue *Red = PhiR->getBackedgeValue();
9415       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9416     }
9417   }
9418 }
9419 
9420 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9421 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9422                                VPSlotTracker &SlotTracker) const {
9423   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9424   IG->getInsertPos()->printAsOperand(O, false);
9425   O << ", ";
9426   getAddr()->printAsOperand(O, SlotTracker);
9427   VPValue *Mask = getMask();
9428   if (Mask) {
9429     O << ", ";
9430     Mask->printAsOperand(O, SlotTracker);
9431   }
9432 
9433   unsigned OpIdx = 0;
9434   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9435     if (!IG->getMember(i))
9436       continue;
9437     if (getNumStoreOperands() > 0) {
9438       O << "\n" << Indent << "  store ";
9439       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9440       O << " to index " << i;
9441     } else {
9442       O << "\n" << Indent << "  ";
9443       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9444       O << " = load from index " << i;
9445     }
9446     ++OpIdx;
9447   }
9448 }
9449 #endif
9450 
9451 void VPWidenCallRecipe::execute(VPTransformState &State) {
9452   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9453                                   *this, State);
9454 }
9455 
9456 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9457   auto &I = *cast<SelectInst>(getUnderlyingInstr());
9458   State.ILV->setDebugLocFromInst(&I);
9459 
  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
9462   // We have to take the 'vectorized' value and pick the first lane.
9463   // Instcombine will make this a no-op.
9464   auto *InvarCond =
9465       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9466 
9467   for (unsigned Part = 0; Part < State.UF; ++Part) {
9468     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9469     Value *Op0 = State.get(getOperand(1), Part);
9470     Value *Op1 = State.get(getOperand(2), Part);
9471     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9472     State.set(this, Sel, Part);
9473     State.ILV->addMetadata(Sel, &I);
9474   }
9475 }
9476 
9477 void VPWidenRecipe::execute(VPTransformState &State) {
9478   auto &I = *cast<Instruction>(getUnderlyingValue());
9479   auto &Builder = State.Builder;
9480   switch (I.getOpcode()) {
9481   case Instruction::Call:
9482   case Instruction::Br:
9483   case Instruction::PHI:
9484   case Instruction::GetElementPtr:
9485   case Instruction::Select:
9486     llvm_unreachable("This instruction is handled by a different recipe.");
9487   case Instruction::UDiv:
9488   case Instruction::SDiv:
9489   case Instruction::SRem:
9490   case Instruction::URem:
9491   case Instruction::Add:
9492   case Instruction::FAdd:
9493   case Instruction::Sub:
9494   case Instruction::FSub:
9495   case Instruction::FNeg:
9496   case Instruction::Mul:
9497   case Instruction::FMul:
9498   case Instruction::FDiv:
9499   case Instruction::FRem:
9500   case Instruction::Shl:
9501   case Instruction::LShr:
9502   case Instruction::AShr:
9503   case Instruction::And:
9504   case Instruction::Or:
9505   case Instruction::Xor: {
9506     // Just widen unops and binops.
9507     State.ILV->setDebugLocFromInst(&I);
9508 
9509     for (unsigned Part = 0; Part < State.UF; ++Part) {
9510       SmallVector<Value *, 2> Ops;
9511       for (VPValue *VPOp : operands())
9512         Ops.push_back(State.get(VPOp, Part));
9513 
9514       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
9515 
9516       if (auto *VecOp = dyn_cast<Instruction>(V)) {
9517         VecOp->copyIRFlags(&I);
9518 
9519         // If the instruction is vectorized and was in a basic block that needed
9520         // predication, we can't propagate poison-generating flags (nuw/nsw,
9521         // exact, etc.). The control flow has been linearized and the
        // instruction is no longer guarded by the predicate, which could cause
        // the flag properties to no longer hold.
9524         if (State.MayGeneratePoisonRecipes.count(this) > 0)
9525           VecOp->dropPoisonGeneratingFlags();
9526       }
9527 
9528       // Use this vector value for all users of the original instruction.
9529       State.set(this, V, Part);
9530       State.ILV->addMetadata(V, &I);
9531     }
9532 
9533     break;
9534   }
9535   case Instruction::ICmp:
9536   case Instruction::FCmp: {
9537     // Widen compares. Generate vector compares.
9538     bool FCmp = (I.getOpcode() == Instruction::FCmp);
9539     auto *Cmp = cast<CmpInst>(&I);
9540     State.ILV->setDebugLocFromInst(Cmp);
9541     for (unsigned Part = 0; Part < State.UF; ++Part) {
9542       Value *A = State.get(getOperand(0), Part);
9543       Value *B = State.get(getOperand(1), Part);
9544       Value *C = nullptr;
9545       if (FCmp) {
9546         // Propagate fast math flags.
9547         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9548         Builder.setFastMathFlags(Cmp->getFastMathFlags());
9549         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
9550       } else {
9551         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
9552       }
9553       State.set(this, C, Part);
9554       State.ILV->addMetadata(C, &I);
9555     }
9556 
9557     break;
9558   }
9559 
9560   case Instruction::ZExt:
9561   case Instruction::SExt:
9562   case Instruction::FPToUI:
9563   case Instruction::FPToSI:
9564   case Instruction::FPExt:
9565   case Instruction::PtrToInt:
9566   case Instruction::IntToPtr:
9567   case Instruction::SIToFP:
9568   case Instruction::UIToFP:
9569   case Instruction::Trunc:
9570   case Instruction::FPTrunc:
9571   case Instruction::BitCast: {
9572     auto *CI = cast<CastInst>(&I);
9573     State.ILV->setDebugLocFromInst(CI);
9574 
    // Vectorize casts.
9576     Type *DestTy = (State.VF.isScalar())
9577                        ? CI->getType()
9578                        : VectorType::get(CI->getType(), State.VF);
9579 
9580     for (unsigned Part = 0; Part < State.UF; ++Part) {
9581       Value *A = State.get(getOperand(0), Part);
9582       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
9583       State.set(this, Cast, Part);
9584       State.ILV->addMetadata(Cast, &I);
9585     }
9586     break;
9587   }
9588   default:
9589     // This instruction is not vectorized by simple widening.
9590     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
9591     llvm_unreachable("Unhandled instruction!");
9592   } // end of switch.
9593 }
9594 
9595 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9596   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
9597   // Construct a vector GEP by widening the operands of the scalar GEP as
9598   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
9599   // results in a vector of pointers when at least one operand of the GEP
9600   // is vector-typed. Thus, to keep the representation compact, we only use
9601   // vector-typed operands for loop-varying values.
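  // For example (illustrative), 'gep %base, %i' with a loop-varying index
  // becomes 'gep %base, <VF x i64> %vec.i', yielding a vector of pointers,
  // while loop-invariant operands are kept scalar.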
9602 
9603   if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
9604     // If we are vectorizing, but the GEP has only loop-invariant operands,
9605     // the GEP we build (by only using vector-typed operands for
9606     // loop-varying values) would be a scalar pointer. Thus, to ensure we
9607     // produce a vector of pointers, we need to either arbitrarily pick an
9608     // operand to broadcast, or broadcast a clone of the original GEP.
9609     // Here, we broadcast a clone of the original.
9610     //
9611     // TODO: If at some point we decide to scalarize instructions having
9612     //       loop-invariant operands, this special case will no longer be
9613     //       required. We would add the scalarization decision to
9614     //       collectLoopScalars() and teach getVectorValue() to broadcast
9615     //       the lane-zero scalar value.
9616     auto *Clone = State.Builder.Insert(GEP->clone());
9617     for (unsigned Part = 0; Part < State.UF; ++Part) {
9618       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
9619       State.set(this, EntryPart, Part);
9620       State.ILV->addMetadata(EntryPart, GEP);
9621     }
9622   } else {
9623     // If the GEP has at least one loop-varying operand, we are sure to
9624     // produce a vector of pointers. But if we are only unrolling, we want
9625     // to produce a scalar GEP for each unroll part. Thus, the GEP we
9626     // produce with the code below will be scalar (if VF == 1) or vector
9627     // (otherwise). Note that for the unroll-only case, we still maintain
9628     // values in the vector mapping with initVector, as we do for other
9629     // instructions.
9630     for (unsigned Part = 0; Part < State.UF; ++Part) {
9631       // The pointer operand of the new GEP. If it's loop-invariant, we
9632       // won't broadcast it.
9633       auto *Ptr = IsPtrLoopInvariant
9634                       ? State.get(getOperand(0), VPIteration(0, 0))
9635                       : State.get(getOperand(0), Part);
9636 
9637       // Collect all the indices for the new GEP. If any index is
9638       // loop-invariant, we won't broadcast it.
9639       SmallVector<Value *, 4> Indices;
9640       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9641         VPValue *Operand = getOperand(I);
9642         if (IsIndexLoopInvariant[I - 1])
9643           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9644         else
9645           Indices.push_back(State.get(Operand, Part));
9646       }
9647 
9648       // If the GEP instruction is vectorized and was in a basic block that
9649       // needed predication, we can't propagate the poison-generating 'inbounds'
9650       // flag. The control flow has been linearized and the GEP is no longer
9651       // guarded by the predicate, which could cause the 'inbounds' property to
9652       // no longer hold.
9653       bool IsInBounds =
9654           GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9655 
9656       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
9657       // but it should be a vector, otherwise.
9658       auto *NewGEP = IsInBounds
9659                          ? State.Builder.CreateInBoundsGEP(
9660                                GEP->getSourceElementType(), Ptr, Indices)
9661                          : State.Builder.CreateGEP(GEP->getSourceElementType(),
9662                                                    Ptr, Indices);
9663       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9664              "NewGEP is not a pointer vector");
9665       State.set(this, NewGEP, Part);
9666       State.ILV->addMetadata(NewGEP, GEP);
9667     }
9668   }
9669 }
9670 
9671 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9672   assert(!State.Instance && "Int or FP induction being replicated.");
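  // Delegate the actual widening to the ILV helper, passing the start value,
  // the optional truncate instruction, and the VPValue that will hold the
  // result.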
9673   State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
9674                                    getStartValue()->getLiveInIRValue(),
9675                                    getTruncInst(), getVPValue(0), State);
9676 }
9677 
9678 void VPWidenPHIRecipe::execute(VPTransformState &State) {
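  // Widening of the underlying phi node is delegated entirely to the ILV.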
9679   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
9680                                  State);
9681 }
9682 
9683 void VPBlendRecipe::execute(VPTransformState &State) {
9684   State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9685   // We know that all PHIs in non-header blocks are converted into
9686   // selects, so we don't have to worry about the insertion order and we
9687   // can just use the builder.
9688   // At this point we generate the predication tree. There may be
9689   // duplications since this is a simple recursive scan, but future
9690   // optimizations will clean it up.
9691 
9692   unsigned NumIncoming = getNumIncomingValues();
9693 
9694   // Generate a sequence of selects of the form:
9695   // SELECT(Mask3, In3,
9696   //        SELECT(Mask2, In2,
9697   //               SELECT(Mask1, In1,
9698   //                      In0)))
9699   // Note that Mask0 is never used: lanes for which no path reaches this phi
9700   // are essentially undef and are taken from In0.
9701   InnerLoopVectorizer::VectorParts Entry(State.UF);
9702   for (unsigned In = 0; In < NumIncoming; ++In) {
9703     for (unsigned Part = 0; Part < State.UF; ++Part) {
9704       // We might have single edge PHIs (blocks) - use an identity
9705       // 'select' for the first PHI operand.
9706       Value *In0 = State.get(getIncomingValue(In), Part);
9707       if (In == 0)
9708         Entry[Part] = In0; // Initialize with the first incoming value.
9709       else {
9710         // Select between the current value and the previous incoming edge
9711         // based on the incoming mask.
9712         Value *Cond = State.get(getMask(In), Part);
9713         Entry[Part] =
9714             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9715       }
9716     }
9717   }
9718   for (unsigned Part = 0; Part < State.UF; ++Part)
9719     State.set(this, Entry[Part], Part);
9720 }
9721 
9722 void VPInterleaveRecipe::execute(VPTransformState &State) {
9723   assert(!State.Instance && "Interleave group being replicated.");
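  // The ILV emits the wide loads/stores and the shuffles for the whole
  // interleave group at once, based on the group's address, stored values and
  // optional mask.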
9724   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9725                                       getStoredValues(), getMask());
9726 }
9727 
9728 void VPReductionRecipe::execute(VPTransformState &State) {
9729   assert(!State.Instance && "Reduction being replicated.");
9730   Value *PrevInChain = State.get(getChainOp(), 0);
9731   RecurKind Kind = RdxDesc->getRecurrenceKind();
9732   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9733   // Propagate the fast-math flags carried by the underlying instruction.
9734   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9735   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9736   for (unsigned Part = 0; Part < State.UF; ++Part) {
9737     Value *NewVecOp = State.get(getVecOp(), Part);
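    // For conditional reductions, blend the reduction identity into the
    // masked-off lanes so they do not contribute to the reduced value.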
9738     if (VPValue *Cond = getCondOp()) {
9739       Value *NewCond = State.get(Cond, Part);
9740       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9741       Value *Iden = RdxDesc->getRecurrenceIdentity(
9742           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9743       Value *IdenVec =
9744           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9745       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9746       NewVecOp = Select;
9747     }
9748     Value *NewRed;
9749     Value *NextInChain;
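    // Ordered reductions fold the vector operand directly into the running
    // chain value; unordered reductions reduce this part to a scalar first
    // and combine it with the chain value below.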
9750     if (IsOrdered) {
9751       if (State.VF.isVector())
9752         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9753                                         PrevInChain);
9754       else
9755         NewRed = State.Builder.CreateBinOp(
9756             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9757             NewVecOp);
9758       PrevInChain = NewRed;
9759     } else {
9760       PrevInChain = State.get(getChainOp(), Part);
9761       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9762     }
9763     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9764       NextInChain =
9765           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9766                          NewRed, PrevInChain);
9767     } else if (IsOrdered)
9768       NextInChain = NewRed;
9769     else
9770       NextInChain = State.Builder.CreateBinOp(
9771           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9772           PrevInChain);
9773     State.set(this, NextInChain, Part);
9774   }
9775 }
9776 
9777 void VPReplicateRecipe::execute(VPTransformState &State) {
9778   if (State.Instance) { // Generate a single instance.
9779     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9780     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9781                                     IsPredicated, State);
9782     // Insert the scalar instance, packing it into a vector.
9783     if (AlsoPack && State.VF.isVector()) {
9784       // If we're constructing lane 0, initialize to start from poison.
9785       if (State.Instance->Lane.isFirstLane()) {
9786         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9787         Value *Poison = PoisonValue::get(
9788             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9789         State.set(this, Poison, State.Instance->Part);
9790       }
9791       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9792     }
9793     return;
9794   }
9795 
9796   // Generate scalar instances for all VF lanes of all UF parts, unless the
9797   // instruction is uniform, in which case generate only the first lane for
9798   // each of the UF parts.
9799   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9800   assert((!State.VF.isScalable() || IsUniform) &&
9801          "Can't scalarize a scalable vector");
9802   for (unsigned Part = 0; Part < State.UF; ++Part)
9803     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9804       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9805                                       VPIteration(Part, Lane), IsPredicated,
9806                                       State);
9807 }
9808 
9809 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9810   assert(State.Instance && "Branch on Mask works only on single instance.");
9811 
9812   unsigned Part = State.Instance->Part;
9813   unsigned Lane = State.Instance->Lane.getKnownLane();
9814 
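  // Compute the scalar condition bit for the current lane, extracting it from
  // the block-in mask when one is present.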
9815   Value *ConditionBit = nullptr;
9816   VPValue *BlockInMask = getMask();
9817   if (BlockInMask) {
9818     ConditionBit = State.get(BlockInMask, Part);
9819     if (ConditionBit->getType()->isVectorTy())
9820       ConditionBit = State.Builder.CreateExtractElement(
9821           ConditionBit, State.Builder.getInt32(Lane));
9822   } else // Block in mask is all-one.
9823     ConditionBit = State.Builder.getTrue();
9824 
9825   // Replace the temporary unreachable terminator with a new conditional branch,
9826   // whose two destinations will be set later when they are created.
9827   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9828   assert(isa<UnreachableInst>(CurrentTerminator) &&
9829          "Expected to replace unreachable terminator with conditional branch.");
9830   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9831   CondBr->setSuccessor(0, nullptr);
9832   ReplaceInstWithInst(CurrentTerminator, CondBr);
9833 }
9834 
9835 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9836   assert(State.Instance && "Predicated instruction PHI works per instance.");
9837   Instruction *ScalarPredInst =
9838       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9839   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9840   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9841   assert(PredicatingBB && "Predicated block has no single predecessor.");
9842   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9843          "operand must be VPReplicateRecipe");
9844 
9845   // By current pack/unpack logic we need to generate only a single phi node: if
9846   // a vector value for the predicated instruction exists at this point it means
9847   // the instruction has vector users only, and a phi for the vector value is
9848   // needed. In this case the recipe of the predicated instruction is marked to
9849   // also do that packing, thereby "hoisting" the insert-element sequence.
9850   // Otherwise, a phi node for the scalar value is needed.
9851   unsigned Part = State.Instance->Part;
9852   if (State.hasVectorValue(getOperand(0), Part)) {
9853     Value *VectorValue = State.get(getOperand(0), Part);
9854     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9855     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9856     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9857     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9858     if (State.hasVectorValue(this, Part))
9859       State.reset(this, VPhi, Part);
9860     else
9861       State.set(this, VPhi, Part);
9862     // NOTE: Currently we need to update the value of the operand, so the next
9863     // predicated iteration inserts its generated value in the correct vector.
9864     State.reset(getOperand(0), VPhi, Part);
9865   } else {
9866     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9867     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9868     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9869                      PredicatingBB);
9870     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9871     if (State.hasScalarValue(this, *State.Instance))
9872       State.reset(this, Phi, *State.Instance);
9873     else
9874       State.set(this, Phi, *State.Instance);
9875     // NOTE: Currently we need to update the value of the operand, so the next
9876     // predicated iteration inserts its generated value in the correct vector.
9877     State.reset(getOperand(0), Phi, *State.Instance);
9878   }
9879 }
9880 
9881 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9882   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9883 
9884   // Attempt to issue a wide load.
9885   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9886   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9887 
9888   assert((LI || SI) && "Invalid Load/Store instruction");
9889   assert((!SI || StoredValue) && "No stored value provided for widened store");
9890   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9891 
9892   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9893 
9894   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9895   const Align Alignment = getLoadStoreAlignment(&Ingredient);
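  // Non-consecutive accesses are widened into gathers/scatters; consecutive
  // accesses use wide loads/stores through a pointer to the vector type.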
9896   bool CreateGatherScatter = !Consecutive;
9897 
9898   auto &Builder = State.Builder;
9899   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9900   bool isMaskRequired = getMask();
9901   if (isMaskRequired)
9902     for (unsigned Part = 0; Part < State.UF; ++Part)
9903       BlockInMaskParts[Part] = State.get(getMask(), Part);
9904 
9905   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9906     // Calculate the pointer for the specific unroll-part.
9907     GetElementPtrInst *PartPtr = nullptr;
9908 
9909     bool InBounds = false;
9910     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9911       InBounds = gep->isInBounds();
9912     if (Reverse) {
9913       // If the address is consecutive but reversed, then the
9914       // wide store needs to start at the last vector element.
9915       // RunTimeVF = VScale * VF.getKnownMinValue()
9916       // For fixed-width, VScale is 1, so RunTimeVF = VF.getKnownMinValue()
9917       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9918       // NumElt = -Part * RunTimeVF
9919       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9920       // LastLane = 1 - RunTimeVF
9921       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
9922       PartPtr =
9923           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9924       PartPtr->setIsInBounds(InBounds);
9925       PartPtr = cast<GetElementPtrInst>(
9926           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9927       PartPtr->setIsInBounds(InBounds);
9928       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9929         BlockInMaskParts[Part] =
9930             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9931     } else {
9932       Value *Increment =
9933           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9934       PartPtr = cast<GetElementPtrInst>(
9935           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9936       PartPtr->setIsInBounds(InBounds);
9937     }
9938 
9939     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9940     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9941   };
9942 
9943   // Handle Stores:
9944   if (SI) {
9945     State.ILV->setDebugLocFromInst(SI);
9946 
9947     for (unsigned Part = 0; Part < State.UF; ++Part) {
9948       Instruction *NewSI = nullptr;
9949       Value *StoredVal = State.get(StoredValue, Part);
9950       if (CreateGatherScatter) {
9951         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9952         Value *VectorGep = State.get(getAddr(), Part);
9953         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9954                                             MaskPart);
9955       } else {
9956         if (Reverse) {
9957           // If we store to reverse consecutive memory locations, then we need
9958           // to reverse the order of elements in the stored value.
9959           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9960           // We don't want to update the value in the map as it might be used in
9961           // another expression. So don't call resetVectorValue(StoredVal).
9962         }
9963         auto *VecPtr =
9964             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9965         if (isMaskRequired)
9966           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9967                                             BlockInMaskParts[Part]);
9968         else
9969           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9970       }
9971       State.ILV->addMetadata(NewSI, SI);
9972     }
9973     return;
9974   }
9975 
9976   // Handle loads.
9977   assert(LI && "Must have a load instruction");
9978   State.ILV->setDebugLocFromInst(LI);
9979   for (unsigned Part = 0; Part < State.UF; ++Part) {
9980     Value *NewLI;
9981     if (CreateGatherScatter) {
9982       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9983       Value *VectorGep = State.get(getAddr(), Part);
9984       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9985                                          nullptr, "wide.masked.gather");
9986       State.ILV->addMetadata(NewLI, LI);
9987     } else {
9988       auto *VecPtr =
9989           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9990       if (isMaskRequired)
9991         NewLI = Builder.CreateMaskedLoad(
9992             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9993             PoisonValue::get(DataTy), "wide.masked.load");
9994       else
9995         NewLI =
9996             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9997 
9998       // Add metadata to the load, but setVectorValue to the reverse shuffle.
9999       State.ILV->addMetadata(NewLI, LI);
10000       if (Reverse)
10001         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
10002     }
10003 
10004     State.set(getVPSingleValue(), NewLI, Part);
10005   }
10006 }
10007 
10008 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10009 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10010 // predication, and 4) a TTI hook that analyses whether the loop is suitable
10011 // for predication.
10012 static ScalarEpilogueLowering getScalarEpilogueLowering(
10013     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10014     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10015     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
10016     LoopVectorizationLegality &LVL) {
10017   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10018   // don't look at hints or options, and don't request a scalar epilogue.
10019   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10020   // LoopAccessInfo (due to code dependency and not being able to reliably get
10021   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10022   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10023   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10024   // back to the old way and vectorize with versioning when forced. See D81345.)
10025   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10026                                                       PGSOQueryType::IRPass) &&
10027                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10028     return CM_ScalarEpilogueNotAllowedOptSize;
10029 
10030   // 2) If set, obey the directives
10031   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10032     switch (PreferPredicateOverEpilogue) {
10033     case PreferPredicateTy::ScalarEpilogue:
10034       return CM_ScalarEpilogueAllowed;
10035     case PreferPredicateTy::PredicateElseScalarEpilogue:
10036       return CM_ScalarEpilogueNotNeededUsePredicate;
10037     case PreferPredicateTy::PredicateOrDontVectorize:
10038       return CM_ScalarEpilogueNotAllowedUsePredicate;
10039     }
10040   }
10041 
10042   // 3) If set, obey the hints
10043   switch (Hints.getPredicate()) {
10044   case LoopVectorizeHints::FK_Enabled:
10045     return CM_ScalarEpilogueNotNeededUsePredicate;
10046   case LoopVectorizeHints::FK_Disabled:
10047     return CM_ScalarEpilogueAllowed;
10048   }
10049 
10050   // 4) if the TTI hook indicates this is profitable, request predication.
10051   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10052                                        LVL.getLAI()))
10053     return CM_ScalarEpilogueNotNeededUsePredicate;
10054 
10055   return CM_ScalarEpilogueAllowed;
10056 }
10057 
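// Return the vector value for \p Def and \p Part, creating it on demand from
// already-generated scalar values when it does not exist yet: uniform values
// are broadcast, other values are packed lane by lane with insertelements.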
10058 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
10059   // If Values have been set for this Def, return the one relevant for \p Part.
10060   if (hasVectorValue(Def, Part))
10061     return Data.PerPartOutput[Def][Part];
10062 
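  // If neither a vector nor a scalar value has been generated for Def, it is
  // a live-in IR value; broadcast it once and reuse the result for this part.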
10063   if (!hasScalarValue(Def, {Part, 0})) {
10064     Value *IRV = Def->getLiveInIRValue();
10065     Value *B = ILV->getBroadcastInstrs(IRV);
10066     set(Def, B, Part);
10067     return B;
10068   }
10069 
10070   Value *ScalarValue = get(Def, {Part, 0});
10071   // If we aren't vectorizing, we can just copy the scalar map values over
10072   // to the vector map.
10073   if (VF.isScalar()) {
10074     set(Def, ScalarValue, Part);
10075     return ScalarValue;
10076   }
10077 
10078   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
10079   bool IsUniform = RepR && RepR->isUniform();
10080 
10081   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
10082   // Check if there is a scalar value for the selected lane.
10083   if (!hasScalarValue(Def, {Part, LastLane})) {
10084     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
10085     assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
10086            "unexpected recipe found to be invariant");
10087     IsUniform = true;
10088     LastLane = 0;
10089   }
10090 
10091   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10092   // Set the insert point after the last scalarized instruction or after the
10093   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10094   // will directly follow the scalar definitions.
10095   auto OldIP = Builder.saveIP();
10096   auto NewIP =
10097       isa<PHINode>(LastInst)
10098           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10099           : std::next(BasicBlock::iterator(LastInst));
10100   Builder.SetInsertPoint(&*NewIP);
10101 
10102   // However, if we are vectorizing, we need to construct the vector values.
10103   // If the value is known to be uniform after vectorization, we can just
10104   // broadcast the scalar value corresponding to lane zero for each unroll
10105   // iteration. Otherwise, we construct the vector values using
10106   // insertelement instructions. Since the resulting vectors are stored in
10107   // State, we will only generate the insertelements once.
10108   Value *VectorValue = nullptr;
10109   if (IsUniform) {
10110     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10111     set(Def, VectorValue, Part);
10112   } else {
10113     // Initialize packing with insertelements to start from poison.
10114     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
10115     Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
10116     set(Def, Poison, Part);
10117     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10118       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10119     VectorValue = get(Def, Part);
10120   }
10121   Builder.restoreIP(OldIP);
10122   return VectorValue;
10123 }
10124 
10125 // Process the loop in the VPlan-native vectorization path. This path builds
10126 // VPlan upfront in the vectorization pipeline, which allows applying
10127 // VPlan-to-VPlan transformations from the very beginning without modifying the
10128 // input LLVM IR.
10129 static bool processLoopInVPlanNativePath(
10130     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10131     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10132     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10133     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10134     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10135     LoopVectorizationRequirements &Requirements) {
10136 
10137   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10138     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10139     return false;
10140   }
10141   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10142   Function *F = L->getHeader()->getParent();
10143   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10144 
10145   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10146       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10147 
10148   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10149                                 &Hints, IAI);
10150   // Use the planner for outer loop vectorization.
10151   // TODO: CM is not used at this point inside the planner. Turn CM into an
10152   // optional argument if we don't need it in the future.
10153   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10154                                Requirements, ORE);
10155 
10156   // Get user vectorization factor.
10157   ElementCount UserVF = Hints.getWidth();
10158 
10159   CM.collectElementTypesForWidening();
10160 
10161   // Plan how to best vectorize, return the best VF and its cost.
10162   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10163 
10164   // If we are stress testing VPlan builds, do not attempt to generate vector
10165   // code. Masked vector code generation support will follow soon.
10166   // Also, do not attempt to vectorize if no vector code will be produced.
10167   if (VPlanBuildStressTest || EnableVPlanPredication ||
10168       VectorizationFactor::Disabled() == VF)
10169     return false;
10170 
10171   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10172 
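  // Generate vector code for the outer loop from the selected plan, using an
  // interleave count of 1.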
10173   {
10174     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10175                              F->getParent()->getDataLayout());
10176     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10177                            &CM, BFI, PSI, Checks);
10178     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10179                       << L->getHeader()->getParent()->getName() << "\"\n");
10180     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10181   }
10182 
10183   // Mark the loop as already vectorized to avoid vectorizing again.
10184   Hints.setAlreadyVectorized();
10185   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10186   return true;
10187 }
10188 
10189 // Emit a remark if there are stores to floats that required a floating point
10190 // extension. If the vectorized loop was generated using the extended (wider)
10191 // type, there will be a performance penalty from the conversion overhead and
10192 // the change in the vector width.
10193 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10194   SmallVector<Instruction *, 4> Worklist;
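  // Seed the worklist with all stores of single-precision floats in the loop.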
10195   for (BasicBlock *BB : L->getBlocks()) {
10196     for (Instruction &Inst : *BB) {
10197       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10198         if (S->getValueOperand()->getType()->isFloatTy())
10199           Worklist.push_back(S);
10200       }
10201     }
10202   }
10203 
10204   // Traverse upwards from the floating point stores, searching for floating
10205   // point conversions.
10206   SmallPtrSet<const Instruction *, 4> Visited;
10207   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10208   while (!Worklist.empty()) {
10209     auto *I = Worklist.pop_back_val();
10210     if (!L->contains(I))
10211       continue;
10212     if (!Visited.insert(I).second)
10213       continue;
10214 
10215     // Emit a remark if the floating point store required a floating
10216     // point conversion.
10217     // TODO: More work could be done to identify the root cause such as a
10218     // constant or a function return type and point the user to it.
10219     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10220       ORE->emit([&]() {
10221         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10222                                           I->getDebugLoc(), L->getHeader())
10223                << "floating point conversion changes vector width. "
10224                << "Mixed floating point precision requires an up/down "
10225                << "cast that will negatively impact performance.";
10226       });
10227 
10228     for (Use &Op : I->operands())
10229       if (auto *OpI = dyn_cast<Instruction>(Op))
10230         Worklist.push_back(OpI);
10231   }
10232 }
10233 
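// When EnableLoopInterleaving or EnableLoopVectorization is off, only loops
// where the transformation is explicitly forced are processed.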
10234 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10235     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10236                                !EnableLoopInterleaving),
10237       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10238                               !EnableLoopVectorization) {}
10239 
10240 bool LoopVectorizePass::processLoop(Loop *L) {
10241   assert((EnableVPlanNativePath || L->isInnermost()) &&
10242          "VPlan-native path is not enabled. Only process inner loops.");
10243 
10244 #ifndef NDEBUG
10245   const std::string DebugLocStr = getDebugLocString(L);
10246 #endif /* NDEBUG */
10247 
10248   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
10249                     << L->getHeader()->getParent()->getName() << "\" from "
10250                     << DebugLocStr << "\n");
10251 
10252   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
10253 
10254   LLVM_DEBUG(
10255       dbgs() << "LV: Loop hints:"
10256              << " force="
10257              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10258                      ? "disabled"
10259                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10260                             ? "enabled"
10261                             : "?"))
10262              << " width=" << Hints.getWidth()
10263              << " interleave=" << Hints.getInterleave() << "\n");
10264 
10265   // Function containing loop
10266   Function *F = L->getHeader()->getParent();
10267 
10268   // Looking at the diagnostic output is the only way to determine if a loop
10269   // was vectorized (other than looking at the IR or machine code), so it
10270   // is important to generate an optimization remark for each loop. Most of
10271   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10272   // generated as OptimizationRemark and OptimizationRemarkMissed are
10273   // less verbose, reporting vectorized loops and unvectorized loops that may
10274   // benefit from vectorization, respectively.
10275 
10276   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10277     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10278     return false;
10279   }
10280 
10281   PredicatedScalarEvolution PSE(*SE, *L);
10282 
10283   // Check if it is legal to vectorize the loop.
10284   LoopVectorizationRequirements Requirements;
10285   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10286                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10287   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10288     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10289     Hints.emitRemarkWithHints();
10290     return false;
10291   }
10292 
10293   // Check the function attributes and profiles to find out if this function
10294   // should be optimized for size.
10295   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10296       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10297 
10298   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10299   // here. They may require CFG and instruction level transformations before
10300   // even evaluating whether vectorization is profitable. Since we cannot modify
10301   // the incoming IR, we need to build VPlan upfront in the vectorization
10302   // pipeline.
10303   if (!L->isInnermost())
10304     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10305                                         ORE, BFI, PSI, Hints, Requirements);
10306 
10307   assert(L->isInnermost() && "Inner loop expected.");
10308 
10309   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10310   // count by optimizing for size, to minimize overheads.
10311   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10312   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10313     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10314                       << "This loop is worth vectorizing only if no scalar "
10315                       << "iteration overheads are incurred.");
10316     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10317       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10318     else {
10319       LLVM_DEBUG(dbgs() << "\n");
10320       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10321     }
10322   }
10323 
10324   // Check the function attributes to see if implicit floats are allowed.
10325   // FIXME: This check doesn't seem possibly correct -- what if the loop is
10326   // an integer loop and the vector instructions selected are purely integer
10327   // vector instructions?
10328   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10329     reportVectorizationFailure(
10330         "Can't vectorize when the NoImplicitFloat attribute is used",
10331         "loop not vectorized due to NoImplicitFloat attribute",
10332         "NoImplicitFloat", ORE, L);
10333     Hints.emitRemarkWithHints();
10334     return false;
10335   }
10336 
10337   // Check if the target supports potentially unsafe FP vectorization.
10338   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10339   // for the target we're vectorizing for, to make sure none of the
10340   // additional fp-math flags can help.
10341   if (Hints.isPotentiallyUnsafe() &&
10342       TTI->isFPVectorizationPotentiallyUnsafe()) {
10343     reportVectorizationFailure(
10344         "Potentially unsafe FP op prevents vectorization",
10345         "loop not vectorized due to unsafe FP support.",
10346         "UnsafeFP", ORE, L);
10347     Hints.emitRemarkWithHints();
10348     return false;
10349   }
10350 
10351   bool AllowOrderedReductions;
10352   // If the flag is set, use that instead and override the TTI behaviour.
10353   if (ForceOrderedReductions.getNumOccurrences() > 0)
10354     AllowOrderedReductions = ForceOrderedReductions;
10355   else
10356     AllowOrderedReductions = TTI->enableOrderedReductions();
10357   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10358     ORE->emit([&]() {
10359       auto *ExactFPMathInst = Requirements.getExactFPInst();
10360       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10361                                                  ExactFPMathInst->getDebugLoc(),
10362                                                  ExactFPMathInst->getParent())
10363              << "loop not vectorized: cannot prove it is safe to reorder "
10364                 "floating-point operations";
10365     });
10366     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10367                          "reorder floating-point operations\n");
10368     Hints.emitRemarkWithHints();
10369     return false;
10370   }
10371 
10372   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10373   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10374 
10375   // If an override option has been passed in for interleaved accesses, use it.
10376   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10377     UseInterleaved = EnableInterleavedMemAccesses;
10378 
10379   // Analyze interleaved memory accesses.
10380   if (UseInterleaved) {
10381     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10382   }
10383 
10384   // Use the cost model.
10385   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10386                                 F, &Hints, IAI);
10387   CM.collectValuesToIgnore();
10388   CM.collectElementTypesForWidening();
10389 
10390   // Use the planner for vectorization.
10391   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10392                                Requirements, ORE);
10393 
10394   // Get user vectorization factor and interleave count.
10395   ElementCount UserVF = Hints.getWidth();
10396   unsigned UserIC = Hints.getInterleave();
10397 
10398   // Plan how to best vectorize, return the best VF and its cost.
10399   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10400 
10401   VectorizationFactor VF = VectorizationFactor::Disabled();
10402   unsigned IC = 1;
10403 
10404   if (MaybeVF) {
10405     VF = *MaybeVF;
10406     // Select the interleave count.
10407     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10408   }
10409 
10410   // Identify the diagnostic messages that should be produced.
10411   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10412   bool VectorizeLoop = true, InterleaveLoop = true;
10413   if (VF.Width.isScalar()) {
10414     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10415     VecDiagMsg = std::make_pair(
10416         "VectorizationNotBeneficial",
10417         "the cost-model indicates that vectorization is not beneficial");
10418     VectorizeLoop = false;
10419   }
10420 
10421   if (!MaybeVF && UserIC > 1) {
10422     // Tell the user interleaving was avoided up-front, despite being explicitly
10423     // requested.
10424     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10425                          "interleaving should be avoided up front\n");
10426     IntDiagMsg = std::make_pair(
10427         "InterleavingAvoided",
10428         "Ignoring UserIC, because interleaving was avoided up front");
10429     InterleaveLoop = false;
10430   } else if (IC == 1 && UserIC <= 1) {
10431     // Tell the user interleaving is not beneficial.
10432     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10433     IntDiagMsg = std::make_pair(
10434         "InterleavingNotBeneficial",
10435         "the cost-model indicates that interleaving is not beneficial");
10436     InterleaveLoop = false;
10437     if (UserIC == 1) {
10438       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10439       IntDiagMsg.second +=
10440           " and is explicitly disabled or interleave count is set to 1";
10441     }
10442   } else if (IC > 1 && UserIC == 1) {
10443     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10444     LLVM_DEBUG(
10445         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10446     IntDiagMsg = std::make_pair(
10447         "InterleavingBeneficialButDisabled",
10448         "the cost-model indicates that interleaving is beneficial "
10449         "but is explicitly disabled or interleave count is set to 1");
10450     InterleaveLoop = false;
10451   }
10452 
10453   // Override IC if user provided an interleave count.
10454   IC = UserIC > 0 ? UserIC : IC;
10455 
10456   // Emit diagnostic messages, if any.
10457   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10458   if (!VectorizeLoop && !InterleaveLoop) {
10459     // Do not vectorize or interleave the loop.
10460     ORE->emit([&]() {
10461       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10462                                       L->getStartLoc(), L->getHeader())
10463              << VecDiagMsg.second;
10464     });
10465     ORE->emit([&]() {
10466       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10467                                       L->getStartLoc(), L->getHeader())
10468              << IntDiagMsg.second;
10469     });
10470     return false;
10471   } else if (!VectorizeLoop && InterleaveLoop) {
10472     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10473     ORE->emit([&]() {
10474       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10475                                         L->getStartLoc(), L->getHeader())
10476              << VecDiagMsg.second;
10477     });
10478   } else if (VectorizeLoop && !InterleaveLoop) {
10479     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10480                       << ") in " << DebugLocStr << '\n');
10481     ORE->emit([&]() {
10482       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10483                                         L->getStartLoc(), L->getHeader())
10484              << IntDiagMsg.second;
10485     });
10486   } else if (VectorizeLoop && InterleaveLoop) {
10487     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10488                       << ") in " << DebugLocStr << '\n');
10489     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10490   }
10491 
10492   bool DisableRuntimeUnroll = false;
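  // Remember the original loop metadata so that follow-up metadata can be
  // attached to the scalar remainder loop after vectorization.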
10493   MDNode *OrigLoopID = L->getLoopID();
10494   {
10495     // Optimistically generate runtime checks. Drop them if they turn out to not
10496     // be profitable. Limit the scope of Checks, so the cleanup happens
10497     // immediately after vector code generation is done.
10498     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10499                              F->getParent()->getDataLayout());
10500     if (!VF.Width.isScalar() || IC > 1)
10501       Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
10502 
10503     using namespace ore;
10504     if (!VectorizeLoop) {
10505       assert(IC > 1 && "interleave count should not be 1 or 0");
10506       // If we decided that it is not legal to vectorize the loop, then
10507       // interleave it.
10508       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10509                                  &CM, BFI, PSI, Checks);
10510 
10511       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10512       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10513 
10514       ORE->emit([&]() {
10515         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10516                                   L->getHeader())
10517                << "interleaved loop (interleaved count: "
10518                << NV("InterleaveCount", IC) << ")";
10519       });
10520     } else {
10521       // If we decided that it is *legal* to vectorize the loop, then do it.
10522 
10523       // Consider vectorizing the epilogue too if it's profitable.
10524       VectorizationFactor EpilogueVF =
10525           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10526       if (EpilogueVF.Width.isVector()) {
10527 
10528         // The first pass vectorizes the main loop and creates a scalar epilogue
10529         // to be vectorized by executing the plan (potentially with a different
10530         // factor) again shortly afterwards.
10531         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10532         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10533                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10534 
10535         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10536         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10537                         DT);
10538         ++LoopsVectorized;
10539 
10540         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10541         formLCSSARecursively(*L, *DT, LI, SE);
10542 
10543         // Second pass vectorizes the epilogue and adjusts the control flow
10544         // edges from the first pass.
10545         EPI.MainLoopVF = EPI.EpilogueVF;
10546         EPI.MainLoopUF = EPI.EpilogueUF;
10547         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10548                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10549                                                  Checks);
10550 
10551         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10552         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10553                         DT);
10554         ++LoopsEpilogueVectorized;
10555 
10556         if (!MainILV.areSafetyChecksAdded())
10557           DisableRuntimeUnroll = true;
10558       } else {
10559         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10560                                &LVL, &CM, BFI, PSI, Checks);
10561 
10562         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10563         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
10564         ++LoopsVectorized;
10565 
10566         // Add metadata to disable runtime unrolling a scalar loop when there
10567         // are no runtime checks about strides and memory. A scalar loop that is
10568         // rarely used is not worth unrolling.
10569         if (!LB.areSafetyChecksAdded())
10570           DisableRuntimeUnroll = true;
10571       }
10572       // Report the vectorization decision.
10573       ORE->emit([&]() {
10574         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10575                                   L->getHeader())
10576                << "vectorized loop (vectorization width: "
10577                << NV("VectorizationFactor", VF.Width)
10578                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10579       });
10580     }
10581 
10582     if (ORE->allowExtraAnalysis(LV_NAME))
10583       checkMixedPrecision(L, ORE);
10584   }
10585 
10586   Optional<MDNode *> RemainderLoopID =
10587       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10588                                       LLVMLoopVectorizeFollowupEpilogue});
10589   if (RemainderLoopID.hasValue()) {
10590     L->setLoopID(RemainderLoopID.getValue());
10591   } else {
10592     if (DisableRuntimeUnroll)
10593       AddRuntimeUnrollDisableMetaData(L);
10594 
10595     // Mark the loop as already vectorized to avoid vectorizing again.
10596     Hints.setAlreadyVectorized();
10597   }
10598 
10599   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10600   return true;
10601 }
10602 
10603 LoopVectorizeResult LoopVectorizePass::runImpl(
10604     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10605     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10606     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10607     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10608     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10609   SE = &SE_;
10610   LI = &LI_;
10611   TTI = &TTI_;
10612   DT = &DT_;
10613   BFI = &BFI_;
10614   TLI = TLI_;
10615   AA = &AA_;
10616   AC = &AC_;
10617   GetLAA = &GetLAA_;
10618   DB = &DB_;
10619   ORE = &ORE_;
10620   PSI = PSI_;
10621 
10622   // Don't attempt if
10623   // 1. the target claims to have no vector registers, and
10624   // 2. interleaving won't help ILP.
10625   //
10626   // The second condition is necessary because, even if the target has no
10627   // vector registers, loop vectorization may still enable scalar
10628   // interleaving.
10629   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10630       TTI->getMaxInterleaveFactor(1) < 2)
10631     return LoopVectorizeResult(false, false);
10632 
10633   bool Changed = false, CFGChanged = false;
10634 
10635   // The vectorizer requires loops to be in simplified form.
10636   // Since simplification may add new inner loops, it has to run before the
10637   // legality and profitability checks. This means running the loop vectorizer
10638   // will simplify all loops, regardless of whether anything ends up being
10639   // vectorized.
10640   for (auto &L : *LI)
10641     Changed |= CFGChanged |=
10642         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10643 
10644   // Build up a worklist of inner-loops to vectorize. This is necessary as
10645   // the act of vectorizing or partially unrolling a loop creates new loops
10646   // and can invalidate iterators across the loops.
10647   SmallVector<Loop *, 8> Worklist;
10648 
10649   for (Loop *L : *LI)
10650     collectSupportedLoops(*L, LI, ORE, Worklist);
10651 
10652   LoopsAnalyzed += Worklist.size();
10653 
10654   // Now walk the identified inner loops.
10655   while (!Worklist.empty()) {
10656     Loop *L = Worklist.pop_back_val();
10657 
10658     // For the inner loops we actually process, form LCSSA to simplify the
10659     // transform.
10660     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10661 
10662     Changed |= CFGChanged |= processLoop(L);
10663   }
10664 
10665   // Process each loop nest in the function.
10666   return LoopVectorizeResult(Changed, CFGChanged);
10667 }
10668 
10669 PreservedAnalyses LoopVectorizePass::run(Function &F,
10670                                          FunctionAnalysisManager &AM) {
10671   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10672   auto &LI = AM.getResult<LoopAnalysis>(F);
10673   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10674   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10675   auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10676   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10677   auto &AA = AM.getResult<AAManager>(F);
10678   auto &AC = AM.getResult<AssumptionAnalysis>(F);
10679   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10680   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10681 
10682   auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10683   std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10684       [&](Loop &L) -> const LoopAccessInfo & {
10685     LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
10686                                       TLI, TTI, nullptr, nullptr, nullptr};
10687     return LAM.getResult<LoopAccessAnalysis>(L, AR);
10688   };
10689   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10690   ProfileSummaryInfo *PSI =
10691       MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10692   LoopVectorizeResult Result =
10693       runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10694   if (!Result.MadeAnyChange)
10695     return PreservedAnalyses::all();
10696   PreservedAnalyses PA;
10697 
10698   // We currently do not preserve loopinfo/dominator analyses with outer loop
10699   // vectorization. Until this is addressed, mark these analyses as preserved
10700   // only for non-VPlan-native path.
10701   // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10702   if (!EnableVPlanNativePath) {
10703     PA.preserve<LoopAnalysis>();
10704     PA.preserve<DominatorTreeAnalysis>();
10705   }
10706 
10707   if (Result.MadeCFGChange) {
10708     // Making CFG changes likely means a loop got vectorized. Indicate that
10709     // extra simplification passes should be run.
10710     // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10711     // be run if runtime checks have been added.
10712     AM.getResult<ShouldRunExtraVectorPasses>(F);
10713     PA.preserve<ShouldRunExtraVectorPasses>();
10714   } else {
10715     PA.preserveSet<CFGAnalyses>();
10716   }
10717   return PA;
10718 }
10719 
10720 void LoopVectorizePass::printPipeline(
10721     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10722   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10723       OS, MapClassName2PassName);
10724 
10725   OS << "<";
10726   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10727   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10728   OS << ">";
10729 }
10730