1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has three parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/LLVMContext.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/InstructionCost.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142 #include "llvm/Transforms/Utils/SizeOpts.h"
143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144 #include <algorithm>
145 #include <cassert>
146 #include <cstdint>
147 #include <cstdlib>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
202     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks with a "
204              "vectorize(enable) pragma."));
205 
206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
207 // that predication is preferred, and this lists all options. I.e., the
208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
209 // and predicate the instructions accordingly. If tail-folding fails, there are
210 // different fallback strategies depending on these values:
211 namespace PreferPredicateTy {
212   enum Option {
213     ScalarEpilogue = 0,
214     PredicateElseScalarEpilogue,
215     PredicateOrDontVectorize
216   };
217 } // namespace PreferPredicateTy
218 
219 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
220     "prefer-predicate-over-epilogue",
221     cl::init(PreferPredicateTy::ScalarEpilogue),
222     cl::Hidden,
223     cl::desc("Tail-folding and predication preferences over creating a scalar "
224              "epilogue loop."),
225     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
226                          "scalar-epilogue",
227                          "Don't tail-predicate loops, create scalar epilogue"),
228               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
229                          "predicate-else-scalar-epilogue",
230                          "prefer tail-folding, create scalar epilogue if tail "
231                          "folding fails."),
232               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
233                          "predicate-dont-vectorize",
234                          "prefers tail-folding, don't attempt vectorization if "
235                          "tail-folding fails.")));
236 
237 static cl::opt<bool> MaximizeBandwidth(
238     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
239     cl::desc("Maximize bandwidth when selecting vectorization factor which "
240              "will be determined by the smallest type in loop."));
241 
242 static cl::opt<bool> EnableInterleavedMemAccesses(
243     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
245 
246 /// An interleave-group may need masking if it resides in a block that needs
247 /// predication, or in order to mask away gaps.
248 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
249     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
250     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
251 
252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
253     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
254     cl::desc("We don't interleave loops with a estimated constant trip count "
255              "below this number"));
256 
257 static cl::opt<unsigned> ForceTargetNumScalarRegs(
258     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
259     cl::desc("A flag that overrides the target's number of scalar registers."));
260 
261 static cl::opt<unsigned> ForceTargetNumVectorRegs(
262     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
263     cl::desc("A flag that overrides the target's number of vector registers."));
264 
265 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
266     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
267     cl::desc("A flag that overrides the target's max interleave factor for "
268              "scalar loops."));
269 
270 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
271     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
272     cl::desc("A flag that overrides the target's max interleave factor for "
273              "vectorized loops."));
274 
275 static cl::opt<unsigned> ForceTargetInstructionCost(
276     "force-target-instruction-cost", cl::init(0), cl::Hidden,
277     cl::desc("A flag that overrides the target's expected cost for "
278              "an instruction to a single constant value. Mostly "
279              "useful for getting consistent testing."));
280 
281 static cl::opt<bool> ForceTargetSupportsScalableVectors(
282     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
283     cl::desc(
284         "Pretend that scalable vectors are supported, even if the target does "
285         "not support them. This flag should only be used for testing."));
286 
287 static cl::opt<unsigned> SmallLoopCost(
288     "small-loop-cost", cl::init(20), cl::Hidden,
289     cl::desc(
290         "The cost of a loop that is considered 'small' by the interleaver."));
291 
292 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
293     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
294     cl::desc("Enable the use of the block frequency analysis to access PGO "
295              "heuristics minimizing code growth in cold regions and being more "
296              "aggressive in hot regions."));
297 
298 // Runtime interleave loops for load/store throughput.
299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
300     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
301     cl::desc(
302         "Enable runtime interleaving until load/store ports are saturated"));
303 
304 /// Interleave small loops with scalar reductions.
305 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
306     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
307     cl::desc("Enable interleaving for loops with small iteration counts that "
308              "contain scalar reductions to expose ILP."));
309 
310 /// The number of stores in a loop that are allowed to need predication.
311 static cl::opt<unsigned> NumberOfStoresToPredicate(
312     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
313     cl::desc("Max number of stores to be predicated behind an if."));
314 
315 static cl::opt<bool> EnableIndVarRegisterHeur(
316     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
317     cl::desc("Count the induction variable only once when interleaving"));
318 
319 static cl::opt<bool> EnableCondStoresVectorization(
320     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
321     cl::desc("Enable if predication of stores during vectorization."));
322 
323 static cl::opt<unsigned> MaxNestedScalarReductionIC(
324     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
325     cl::desc("The maximum interleave count to use when interleaving a scalar "
326              "reduction in a nested loop."));
327 
328 static cl::opt<bool>
329     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
330                            cl::Hidden,
331                            cl::desc("Prefer in-loop vector reductions, "
332                                     "overriding the targets preference."));
333 
334 static cl::opt<bool> ForceOrderedReductions(
335     "force-ordered-reductions", cl::init(false), cl::Hidden,
336     cl::desc("Enable the vectorisation of loops with in-order (strict) "
337              "FP reductions"));
338 
339 static cl::opt<bool> PreferPredicatedReductionSelect(
340     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
341     cl::desc(
342         "Prefer predicating a reduction operation over an after loop select."));
343 
344 cl::opt<bool> EnableVPlanNativePath(
345     "enable-vplan-native-path", cl::init(false), cl::Hidden,
346     cl::desc("Enable VPlan-native vectorization path with "
347              "support for outer loop vectorization."));
348 
349 // FIXME: Remove this switch once we have divergence analysis. Currently we
350 // assume divergent non-backedge branches when this switch is true.
351 cl::opt<bool> EnableVPlanPredication(
352     "enable-vplan-predication", cl::init(false), cl::Hidden,
353     cl::desc("Enable VPlan-native vectorization path predicator with "
354              "support for outer loop vectorization."));
355 
356 // This flag enables the stress testing of the VPlan H-CFG construction in the
357 // VPlan-native vectorization path. It must be used in conjuction with
358 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
359 // verification of the H-CFGs built.
360 static cl::opt<bool> VPlanBuildStressTest(
361     "vplan-build-stress-test", cl::init(false), cl::Hidden,
362     cl::desc(
363         "Build VPlan for every supported loop nest in the function and bail "
364         "out right after the build (stress test the VPlan H-CFG construction "
365         "in the VPlan-native vectorization path)."));
366 
367 cl::opt<bool> llvm::EnableLoopInterleaving(
368     "interleave-loops", cl::init(true), cl::Hidden,
369     cl::desc("Enable loop interleaving in Loop vectorization passes"));
370 cl::opt<bool> llvm::EnableLoopVectorization(
371     "vectorize-loops", cl::init(true), cl::Hidden,
372     cl::desc("Run the Loop vectorization passes"));
373 
374 cl::opt<bool> PrintVPlansInDotFormat(
375     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
376     cl::desc("Use dot format instead of plain text when dumping VPlans"));
377 
378 /// A helper function that returns true if the given type is irregular. The
379 /// type is irregular if its allocated size doesn't equal the store size of an
380 /// element of the corresponding vector type.
381 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
382   // Determine if an array of N elements of type Ty is "bitcast compatible"
383   // with a <N x Ty> vector.
384   // This is only true if there is no padding between the array elements.
385   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
386 }
387 
388 /// A helper function that returns the reciprocal of the block probability of
389 /// predicated blocks. If we return X, we are assuming the predicated block
390 /// will execute once for every X iterations of the loop header.
391 ///
392 /// TODO: We should use actual block probability here, if available. Currently,
393 ///       we always assume predicated blocks have a 50% chance of executing.
394 static unsigned getReciprocalPredBlockProb() { return 2; }
395 
396 /// A helper function that returns an integer or floating-point constant with
397 /// value C.
398 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
399   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
400                            : ConstantFP::get(Ty, C);
401 }
402 
403 /// Returns "best known" trip count for the specified loop \p L as defined by
404 /// the following procedure:
405 ///   1) Returns exact trip count if it is known.
406 ///   2) Returns expected trip count according to profile data if any.
407 ///   3) Returns upper bound estimate if it is known.
408 ///   4) Returns None if all of the above failed.
409 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
410   // Check if exact trip count is known.
411   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
412     return ExpectedTC;
413 
414   // Check if there is an expected trip count available from profile data.
415   if (LoopVectorizeWithBlockFrequency)
416     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
417       return EstimatedTC;
418 
419   // Check if upper bound estimate is known.
420   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
421     return ExpectedTC;
422 
423   return None;
424 }
425 
426 // Forward declare GeneratedRTChecks.
427 class GeneratedRTChecks;
428 
429 namespace llvm {
430 
431 AnalysisKey ShouldRunExtraVectorPasses::Key;
432 
433 /// InnerLoopVectorizer vectorizes loops which contain only one basic
434 /// block to a specified vectorization factor (VF).
435 /// This class performs the widening of scalars into vectors, or multiple
436 /// scalars. This class also implements the following features:
437 /// * It inserts an epilogue loop for handling loops that don't have iteration
438 ///   counts that are known to be a multiple of the vectorization factor.
439 /// * It handles the code generation for reduction variables.
440 /// * Scalarization (implementation using scalars) of un-vectorizable
441 ///   instructions.
442 /// InnerLoopVectorizer does not perform any vectorization-legality
443 /// checks, and relies on the caller to check for the different legality
444 /// aspects. The InnerLoopVectorizer relies on the
445 /// LoopVectorizationLegality class to provide information about the induction
446 /// and reduction variables that were found to a given vectorization factor.
447 class InnerLoopVectorizer {
448 public:
449   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
450                       LoopInfo *LI, DominatorTree *DT,
451                       const TargetLibraryInfo *TLI,
452                       const TargetTransformInfo *TTI, AssumptionCache *AC,
453                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
454                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
455                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
456                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
457       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
458         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
459         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
460         PSI(PSI), RTChecks(RTChecks) {
461     // Query this against the original loop and save it here because the profile
462     // of the original loop header may change as the transformation happens.
463     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
464         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
465   }
466 
467   virtual ~InnerLoopVectorizer() = default;
468 
469   /// Create a new empty loop that will contain vectorized instructions later
470   /// on, while the old loop will be used as the scalar remainder. Control flow
471   /// is generated around the vectorized (and scalar epilogue) loops consisting
472   /// of various checks and bypasses. Return the pre-header block of the new
473   /// loop.
474   /// In the case of epilogue vectorization, this function is overriden to
475   /// handle the more complex control flow around the loops.
476   virtual BasicBlock *createVectorizedLoopSkeleton();
477 
478   /// Widen a single call instruction within the innermost loop.
479   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
480                             VPTransformState &State);
481 
482   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
483   void fixVectorizedLoop(VPTransformState &State);
484 
485   // Return true if any runtime check is added.
486   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
487 
488   /// A type for vectorized values in the new loop. Each value from the
489   /// original loop, when vectorized, is represented by UF vector values in the
490   /// new unrolled loop, where UF is the unroll factor.
491   using VectorParts = SmallVector<Value *, 2>;
492 
493   /// Vectorize a single first-order recurrence or pointer induction PHINode in
494   /// a block. This method handles the induction variable canonicalization. It
495   /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
496   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
497                            VPTransformState &State);
498 
499   /// A helper function to scalarize a single Instruction in the innermost loop.
500   /// Generates a sequence of scalar instances for each lane between \p MinLane
501   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
502   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
503   /// Instr's operands.
504   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
505                             const VPIteration &Instance, bool IfPredicateInstr,
506                             VPTransformState &State);
507 
508   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
509   /// is provided, the integer induction variable will first be truncated to
510   /// the corresponding type.
511   void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
512                              Value *Start, TruncInst *Trunc, VPValue *Def,
513                              VPTransformState &State);
514 
515   /// Construct the vector value of a scalarized value \p V one lane at a time.
516   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
517                                  VPTransformState &State);
518 
519   /// Try to vectorize interleaved access group \p Group with the base address
520   /// given in \p Addr, optionally masking the vector operations if \p
521   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
522   /// values in the vectorized loop.
523   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
524                                 ArrayRef<VPValue *> VPDefs,
525                                 VPTransformState &State, VPValue *Addr,
526                                 ArrayRef<VPValue *> StoredValues,
527                                 VPValue *BlockInMask = nullptr);
528 
529   /// Set the debug location in the builder \p Ptr using the debug location in
530   /// \p V. If \p Ptr is None then it uses the class member's Builder.
531   void setDebugLocFromInst(const Value *V,
532                            Optional<IRBuilder<> *> CustomBuilder = None);
533 
534   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
535   void fixNonInductionPHIs(VPTransformState &State);
536 
537   /// Returns true if the reordering of FP operations is not allowed, but we are
538   /// able to vectorize with strict in-order reductions for the given RdxDesc.
539   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
540 
541   /// Create a broadcast instruction. This method generates a broadcast
542   /// instruction (shuffle) for loop invariant values and for the induction
543   /// value. If this is the induction variable then we extend it to N, N+1, ...
544   /// this is needed because each iteration in the loop corresponds to a SIMD
545   /// element.
546   virtual Value *getBroadcastInstrs(Value *V);
547 
548   /// Add metadata from one instruction to another.
549   ///
550   /// This includes both the original MDs from \p From and additional ones (\see
551   /// addNewMetadata).  Use this for *newly created* instructions in the vector
552   /// loop.
553   void addMetadata(Instruction *To, Instruction *From);
554 
555   /// Similar to the previous function but it adds the metadata to a
556   /// vector of instructions.
557   void addMetadata(ArrayRef<Value *> To, Instruction *From);
558 
559 protected:
560   friend class LoopVectorizationPlanner;
561 
562   /// A small list of PHINodes.
563   using PhiVector = SmallVector<PHINode *, 4>;
564 
565   /// A type for scalarized values in the new loop. Each value from the
566   /// original loop, when scalarized, is represented by UF x VF scalar values
567   /// in the new unrolled loop, where UF is the unroll factor and VF is the
568   /// vectorization factor.
569   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
570 
571   /// Set up the values of the IVs correctly when exiting the vector loop.
572   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
573                     Value *CountRoundDown, Value *EndValue,
574                     BasicBlock *MiddleBlock);
575 
576   /// Create a new induction variable inside L.
577   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
578                                    Value *Step, Instruction *DL);
579 
580   /// Handle all cross-iteration phis in the header.
581   void fixCrossIterationPHIs(VPTransformState &State);
582 
583   /// Create the exit value of first order recurrences in the middle block and
584   /// update their users.
585   void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
586                                VPTransformState &State);
587 
588   /// Create code for the loop exit value of the reduction.
589   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
590 
591   /// Clear NSW/NUW flags from reduction instructions if necessary.
592   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
593                                VPTransformState &State);
594 
595   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
596   /// means we need to add the appropriate incoming value from the middle
597   /// block as exiting edges from the scalar epilogue loop (if present) are
598   /// already in place, and we exit the vector loop exclusively to the middle
599   /// block.
600   void fixLCSSAPHIs(VPTransformState &State);
601 
602   /// Iteratively sink the scalarized operands of a predicated instruction into
603   /// the block that was created for it.
604   void sinkScalarOperands(Instruction *PredInst);
605 
606   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
607   /// represented as.
608   void truncateToMinimalBitwidths(VPTransformState &State);
609 
610   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
611   /// variable on which to base the steps, \p Step is the size of the step, and
612   /// \p EntryVal is the value from the original loop that maps to the steps.
613   /// Note that \p EntryVal doesn't have to be an induction variable - it
614   /// can also be a truncate instruction.
615   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
616                         const InductionDescriptor &ID, VPValue *Def,
617                         VPTransformState &State);
618 
619   /// Create a vector induction phi node based on an existing scalar one. \p
620   /// EntryVal is the value from the original loop that maps to the vector phi
621   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
622   /// truncate instruction, instead of widening the original IV, we widen a
623   /// version of the IV truncated to \p EntryVal's type.
624   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
625                                        Value *Step, Value *Start,
626                                        Instruction *EntryVal, VPValue *Def,
627                                        VPTransformState &State);
628 
629   /// Returns true if an instruction \p I should be scalarized instead of
630   /// vectorized for the chosen vectorization factor.
631   bool shouldScalarizeInstruction(Instruction *I) const;
632 
633   /// Returns true if we should generate a scalar version of \p IV.
634   bool needsScalarInduction(Instruction *IV) const;
635 
636   /// Generate a shuffle sequence that will reverse the vector Vec.
637   virtual Value *reverseVector(Value *Vec);
638 
639   /// Returns (and creates if needed) the original loop trip count.
640   Value *getOrCreateTripCount(Loop *NewLoop);
641 
642   /// Returns (and creates if needed) the trip count of the widened loop.
643   Value *getOrCreateVectorTripCount(Loop *NewLoop);
644 
645   /// Returns a bitcasted value to the requested vector type.
646   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
647   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
648                                 const DataLayout &DL);
649 
650   /// Emit a bypass check to see if the vector trip count is zero, including if
651   /// it overflows.
652   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
653 
654   /// Emit a bypass check to see if all of the SCEV assumptions we've
655   /// had to make are correct. Returns the block containing the checks or
656   /// nullptr if no checks have been added.
657   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
658 
659   /// Emit bypass checks to check any memory assumptions we may have made.
660   /// Returns the block containing the checks or nullptr if no checks have been
661   /// added.
662   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
663 
664   /// Compute the transformed value of Index at offset StartValue using step
665   /// StepValue.
666   /// For integer induction, returns StartValue + Index * StepValue.
667   /// For pointer induction, returns StartValue[Index * StepValue].
668   /// FIXME: The newly created binary instructions should contain nsw/nuw
669   /// flags, which can be found from the original scalar operations.
670   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
671                               const DataLayout &DL,
672                               const InductionDescriptor &ID,
673                               BasicBlock *VectorHeader) const;
674 
675   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
676   /// vector loop preheader, middle block and scalar preheader. Also
677   /// allocate a loop object for the new vector loop and return it.
678   Loop *createVectorLoopSkeleton(StringRef Prefix);
679 
680   /// Create new phi nodes for the induction variables to resume iteration count
681   /// in the scalar epilogue, from where the vectorized loop left off (given by
682   /// \p VectorTripCount).
683   /// In cases where the loop skeleton is more complicated (eg. epilogue
684   /// vectorization) and the resume values can come from an additional bypass
685   /// block, the \p AdditionalBypass pair provides information about the bypass
686   /// block and the end value on the edge from bypass to this loop.
687   void createInductionResumeValues(
688       Loop *L, Value *VectorTripCount,
689       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
690 
691   /// Complete the loop skeleton by adding debug MDs, creating appropriate
692   /// conditional branches in the middle block, preparing the builder and
693   /// running the verifier. Take in the vector loop \p L as argument, and return
694   /// the preheader of the completed vector loop.
695   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
696 
697   /// Add additional metadata to \p To that was not present on \p Orig.
698   ///
699   /// Currently this is used to add the noalias annotations based on the
700   /// inserted memchecks.  Use this for instructions that are *cloned* into the
701   /// vector loop.
702   void addNewMetadata(Instruction *To, const Instruction *Orig);
703 
704   /// Collect poison-generating recipes that may generate a poison value that is
705   /// used after vectorization, even when their operands are not poison. Those
706   /// recipes meet the following conditions:
707   ///  * Contribute to the address computation of a recipe generating a widen
708   ///    memory load/store (VPWidenMemoryInstructionRecipe or
709   ///    VPInterleaveRecipe).
710   ///  * Such a widen memory load/store has at least one underlying Instruction
711   ///    that is in a basic block that needs predication and after vectorization
712   ///    the generated instruction won't be predicated.
713   void collectPoisonGeneratingRecipes(VPTransformState &State);
714 
715   /// Allow subclasses to override and print debug traces before/after vplan
716   /// execution, when trace information is requested.
717   virtual void printDebugTracesAtStart(){};
718   virtual void printDebugTracesAtEnd(){};
719 
720   /// The original loop.
721   Loop *OrigLoop;
722 
723   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
724   /// dynamic knowledge to simplify SCEV expressions and converts them to a
725   /// more usable form.
726   PredicatedScalarEvolution &PSE;
727 
728   /// Loop Info.
729   LoopInfo *LI;
730 
731   /// Dominator Tree.
732   DominatorTree *DT;
733 
734   /// Alias Analysis.
735   AAResults *AA;
736 
737   /// Target Library Info.
738   const TargetLibraryInfo *TLI;
739 
740   /// Target Transform Info.
741   const TargetTransformInfo *TTI;
742 
743   /// Assumption Cache.
744   AssumptionCache *AC;
745 
746   /// Interface to emit optimization remarks.
747   OptimizationRemarkEmitter *ORE;
748 
749   /// LoopVersioning.  It's only set up (non-null) if memchecks were
750   /// used.
751   ///
752   /// This is currently only used to add no-alias metadata based on the
753   /// memchecks.  The actually versioning is performed manually.
754   std::unique_ptr<LoopVersioning> LVer;
755 
756   /// The vectorization SIMD factor to use. Each vector will have this many
757   /// vector elements.
758   ElementCount VF;
759 
760   /// The vectorization unroll factor to use. Each scalar is vectorized to this
761   /// many different vector instructions.
762   unsigned UF;
763 
764   /// The builder that we use
765   IRBuilder<> Builder;
766 
767   // --- Vectorization state ---
768 
769   /// The vector-loop preheader.
770   BasicBlock *LoopVectorPreHeader;
771 
772   /// The scalar-loop preheader.
773   BasicBlock *LoopScalarPreHeader;
774 
775   /// Middle Block between the vector and the scalar.
776   BasicBlock *LoopMiddleBlock;
777 
778   /// The unique ExitBlock of the scalar loop if one exists.  Note that
779   /// there can be multiple exiting edges reaching this block.
780   BasicBlock *LoopExitBlock;
781 
782   /// The vector loop body.
783   BasicBlock *LoopVectorBody;
784 
785   /// The scalar loop body.
786   BasicBlock *LoopScalarBody;
787 
788   /// A list of all bypass blocks. The first block is the entry of the loop.
789   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
790 
791   /// The new Induction variable which was added to the new block.
792   PHINode *Induction = nullptr;
793 
794   /// The induction variable of the old basic block.
795   PHINode *OldInduction = nullptr;
796 
797   /// Store instructions that were predicated.
798   SmallVector<Instruction *, 4> PredicatedInstructions;
799 
800   /// Trip count of the original loop.
801   Value *TripCount = nullptr;
802 
803   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
804   Value *VectorTripCount = nullptr;
805 
806   /// The legality analysis.
807   LoopVectorizationLegality *Legal;
808 
809   /// The profitablity analysis.
810   LoopVectorizationCostModel *Cost;
811 
812   // Record whether runtime checks are added.
813   bool AddedSafetyChecks = false;
814 
815   // Holds the end values for each induction variable. We save the end values
816   // so we can later fix-up the external users of the induction variables.
817   DenseMap<PHINode *, Value *> IVEndValues;
818 
819   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
820   // fixed up at the end of vector code generation.
821   SmallVector<PHINode *, 8> OrigPHIsToFix;
822 
823   /// BFI and PSI are used to check for profile guided size optimizations.
824   BlockFrequencyInfo *BFI;
825   ProfileSummaryInfo *PSI;
826 
827   // Whether this loop should be optimized for size based on profile guided size
828   // optimizatios.
829   bool OptForSizeBasedOnProfile;
830 
831   /// Structure to hold information about generated runtime checks, responsible
832   /// for cleaning the checks, if vectorization turns out unprofitable.
833   GeneratedRTChecks &RTChecks;
834 };
835 
836 class InnerLoopUnroller : public InnerLoopVectorizer {
837 public:
838   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
839                     LoopInfo *LI, DominatorTree *DT,
840                     const TargetLibraryInfo *TLI,
841                     const TargetTransformInfo *TTI, AssumptionCache *AC,
842                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
843                     LoopVectorizationLegality *LVL,
844                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
845                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
846       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
847                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
848                             BFI, PSI, Check) {}
849 
850 private:
851   Value *getBroadcastInstrs(Value *V) override;
852   Value *reverseVector(Value *Vec) override;
853 };
854 
855 /// Encapsulate information regarding vectorization of a loop and its epilogue.
856 /// This information is meant to be updated and used across two stages of
857 /// epilogue vectorization.
858 struct EpilogueLoopVectorizationInfo {
859   ElementCount MainLoopVF = ElementCount::getFixed(0);
860   unsigned MainLoopUF = 0;
861   ElementCount EpilogueVF = ElementCount::getFixed(0);
862   unsigned EpilogueUF = 0;
863   BasicBlock *MainLoopIterationCountCheck = nullptr;
864   BasicBlock *EpilogueIterationCountCheck = nullptr;
865   BasicBlock *SCEVSafetyCheck = nullptr;
866   BasicBlock *MemSafetyCheck = nullptr;
867   Value *TripCount = nullptr;
868   Value *VectorTripCount = nullptr;
869 
870   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
871                                 ElementCount EVF, unsigned EUF)
872       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
873     assert(EUF == 1 &&
874            "A high UF for the epilogue loop is likely not beneficial.");
875   }
876 };
877 
878 /// An extension of the inner loop vectorizer that creates a skeleton for a
879 /// vectorized loop that has its epilogue (residual) also vectorized.
880 /// The idea is to run the vplan on a given loop twice, firstly to setup the
881 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
882 /// from the first step and vectorize the epilogue.  This is achieved by
883 /// deriving two concrete strategy classes from this base class and invoking
884 /// them in succession from the loop vectorizer planner.
885 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
886 public:
887   InnerLoopAndEpilogueVectorizer(
888       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
889       DominatorTree *DT, const TargetLibraryInfo *TLI,
890       const TargetTransformInfo *TTI, AssumptionCache *AC,
891       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
892       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
893       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
894       GeneratedRTChecks &Checks)
895       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
896                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
897                             Checks),
898         EPI(EPI) {}
899 
900   // Override this function to handle the more complex control flow around the
901   // three loops.
902   BasicBlock *createVectorizedLoopSkeleton() final override {
903     return createEpilogueVectorizedLoopSkeleton();
904   }
905 
906   /// The interface for creating a vectorized skeleton using one of two
907   /// different strategies, each corresponding to one execution of the vplan
908   /// as described above.
909   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
910 
911   /// Holds and updates state information required to vectorize the main loop
912   /// and its epilogue in two separate passes. This setup helps us avoid
913   /// regenerating and recomputing runtime safety checks. It also helps us to
914   /// shorten the iteration-count-check path length for the cases where the
915   /// iteration count of the loop is so small that the main vector loop is
916   /// completely skipped.
917   EpilogueLoopVectorizationInfo &EPI;
918 };
919 
920 /// A specialized derived class of inner loop vectorizer that performs
921 /// vectorization of *main* loops in the process of vectorizing loops and their
922 /// epilogues.
923 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
924 public:
925   EpilogueVectorizerMainLoop(
926       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
927       DominatorTree *DT, const TargetLibraryInfo *TLI,
928       const TargetTransformInfo *TTI, AssumptionCache *AC,
929       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
930       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
931       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
932       GeneratedRTChecks &Check)
933       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
934                                        EPI, LVL, CM, BFI, PSI, Check) {}
935   /// Implements the interface for creating a vectorized skeleton using the
936   /// *main loop* strategy (ie the first pass of vplan execution).
937   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
938 
939 protected:
940   /// Emits an iteration count bypass check once for the main loop (when \p
941   /// ForEpilogue is false) and once for the epilogue loop (when \p
942   /// ForEpilogue is true).
943   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
944                                              bool ForEpilogue);
945   void printDebugTracesAtStart() override;
946   void printDebugTracesAtEnd() override;
947 };
948 
949 // A specialized derived class of inner loop vectorizer that performs
950 // vectorization of *epilogue* loops in the process of vectorizing loops and
951 // their epilogues.
952 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
953 public:
954   EpilogueVectorizerEpilogueLoop(
955       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
956       DominatorTree *DT, const TargetLibraryInfo *TLI,
957       const TargetTransformInfo *TTI, AssumptionCache *AC,
958       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
959       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
960       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
961       GeneratedRTChecks &Checks)
962       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
963                                        EPI, LVL, CM, BFI, PSI, Checks) {}
964   /// Implements the interface for creating a vectorized skeleton using the
965   /// *epilogue loop* strategy (ie the second pass of vplan execution).
966   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
967 
968 protected:
969   /// Emits an iteration count bypass check after the main vector loop has
970   /// finished to see if there are any iterations left to execute by either
971   /// the vector epilogue or the scalar epilogue.
972   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
973                                                       BasicBlock *Bypass,
974                                                       BasicBlock *Insert);
975   void printDebugTracesAtStart() override;
976   void printDebugTracesAtEnd() override;
977 };
978 } // end namespace llvm
979 
980 /// Look for a meaningful debug location on the instruction or it's
981 /// operands.
982 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
983   if (!I)
984     return I;
985 
986   DebugLoc Empty;
987   if (I->getDebugLoc() != Empty)
988     return I;
989 
990   for (Use &Op : I->operands()) {
991     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
992       if (OpInst->getDebugLoc() != Empty)
993         return OpInst;
994   }
995 
996   return I;
997 }
998 
999 void InnerLoopVectorizer::setDebugLocFromInst(
1000     const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
1001   IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
1002   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
1003     const DILocation *DIL = Inst->getDebugLoc();
1004 
1005     // When a FSDiscriminator is enabled, we don't need to add the multiply
1006     // factors to the discriminators.
1007     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1008         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
1009       // FIXME: For scalable vectors, assume vscale=1.
1010       auto NewDIL =
1011           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1012       if (NewDIL)
1013         B->SetCurrentDebugLocation(NewDIL.getValue());
1014       else
1015         LLVM_DEBUG(dbgs()
1016                    << "Failed to create new discriminator: "
1017                    << DIL->getFilename() << " Line: " << DIL->getLine());
1018     } else
1019       B->SetCurrentDebugLocation(DIL);
1020   } else
1021     B->SetCurrentDebugLocation(DebugLoc());
1022 }
1023 
1024 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1025 /// is passed, the message relates to that particular instruction.
1026 #ifndef NDEBUG
1027 static void debugVectorizationMessage(const StringRef Prefix,
1028                                       const StringRef DebugMsg,
1029                                       Instruction *I) {
1030   dbgs() << "LV: " << Prefix << DebugMsg;
1031   if (I != nullptr)
1032     dbgs() << " " << *I;
1033   else
1034     dbgs() << '.';
1035   dbgs() << '\n';
1036 }
1037 #endif
1038 
1039 /// Create an analysis remark that explains why vectorization failed
1040 ///
1041 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1042 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1043 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1044 /// the location of the remark.  \return the remark object that can be
1045 /// streamed to.
1046 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1047     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1048   Value *CodeRegion = TheLoop->getHeader();
1049   DebugLoc DL = TheLoop->getStartLoc();
1050 
1051   if (I) {
1052     CodeRegion = I->getParent();
1053     // If there is no debug location attached to the instruction, revert back to
1054     // using the loop's.
1055     if (I->getDebugLoc())
1056       DL = I->getDebugLoc();
1057   }
1058 
1059   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1060 }
1061 
1062 /// Return a value for Step multiplied by VF.
1063 static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
1064                               int64_t Step) {
1065   assert(Ty->isIntegerTy() && "Expected an integer step");
1066   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
1067   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1068 }
1069 
1070 namespace llvm {
1071 
1072 /// Return the runtime value for VF.
1073 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1074   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1075   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1076 }
1077 
1078 static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
1079   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1080   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1081   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1082   return B.CreateUIToFP(RuntimeVF, FTy);
1083 }
1084 
1085 void reportVectorizationFailure(const StringRef DebugMsg,
1086                                 const StringRef OREMsg, const StringRef ORETag,
1087                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1088                                 Instruction *I) {
1089   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1090   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1091   ORE->emit(
1092       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1093       << "loop not vectorized: " << OREMsg);
1094 }
1095 
1096 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1097                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1098                              Instruction *I) {
1099   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1100   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1101   ORE->emit(
1102       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1103       << Msg);
1104 }
1105 
1106 } // end namespace llvm
1107 
1108 #ifndef NDEBUG
1109 /// \return string containing a file name and a line # for the given loop.
1110 static std::string getDebugLocString(const Loop *L) {
1111   std::string Result;
1112   if (L) {
1113     raw_string_ostream OS(Result);
1114     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1115       LoopDbgLoc.print(OS);
1116     else
1117       // Just print the module name.
1118       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1119     OS.flush();
1120   }
1121   return Result;
1122 }
1123 #endif
1124 
1125 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1126                                          const Instruction *Orig) {
1127   // If the loop was versioned with memchecks, add the corresponding no-alias
1128   // metadata.
1129   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1130     LVer->annotateInstWithNoAlias(To, Orig);
1131 }
1132 
1133 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1134     VPTransformState &State) {
1135 
1136   // Collect recipes in the backward slice of `Root` that may generate a poison
1137   // value that is used after vectorization.
1138   SmallPtrSet<VPRecipeBase *, 16> Visited;
1139   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1140     SmallVector<VPRecipeBase *, 16> Worklist;
1141     Worklist.push_back(Root);
1142 
1143     // Traverse the backward slice of Root through its use-def chain.
1144     while (!Worklist.empty()) {
1145       VPRecipeBase *CurRec = Worklist.back();
1146       Worklist.pop_back();
1147 
1148       if (!Visited.insert(CurRec).second)
1149         continue;
1150 
1151       // Prune search if we find another recipe generating a widen memory
1152       // instruction. Widen memory instructions involved in address computation
1153       // will lead to gather/scatter instructions, which don't need to be
1154       // handled.
1155       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1156           isa<VPInterleaveRecipe>(CurRec))
1157         continue;
1158 
1159       // This recipe contributes to the address computation of a widen
1160       // load/store. Collect recipe if its underlying instruction has
1161       // poison-generating flags.
1162       Instruction *Instr = CurRec->getUnderlyingInstr();
1163       if (Instr && Instr->hasPoisonGeneratingFlags())
1164         State.MayGeneratePoisonRecipes.insert(CurRec);
1165 
1166       // Add new definitions to the worklist.
1167       for (VPValue *operand : CurRec->operands())
1168         if (VPDef *OpDef = operand->getDef())
1169           Worklist.push_back(cast<VPRecipeBase>(OpDef));
1170     }
1171   });
1172 
1173   // Traverse all the recipes in the VPlan and collect the poison-generating
1174   // recipes in the backward slice starting at the address of a VPWidenRecipe or
1175   // VPInterleaveRecipe.
1176   auto Iter = depth_first(
1177       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1178   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1179     for (VPRecipeBase &Recipe : *VPBB) {
1180       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1181         Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
1182         VPDef *AddrDef = WidenRec->getAddr()->getDef();
1183         if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
1184             Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
1185           collectPoisonGeneratingInstrsInBackwardSlice(
1186               cast<VPRecipeBase>(AddrDef));
1187       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1188         VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1189         if (AddrDef) {
1190           // Check if any member of the interleave group needs predication.
1191           const InterleaveGroup<Instruction> *InterGroup =
1192               InterleaveRec->getInterleaveGroup();
1193           bool NeedPredication = false;
1194           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1195                I < NumMembers; ++I) {
1196             Instruction *Member = InterGroup->getMember(I);
1197             if (Member)
1198               NeedPredication |=
1199                   Legal->blockNeedsPredication(Member->getParent());
1200           }
1201 
1202           if (NeedPredication)
1203             collectPoisonGeneratingInstrsInBackwardSlice(
1204                 cast<VPRecipeBase>(AddrDef));
1205         }
1206       }
1207     }
1208   }
1209 }
1210 
1211 void InnerLoopVectorizer::addMetadata(Instruction *To,
1212                                       Instruction *From) {
1213   propagateMetadata(To, From);
1214   addNewMetadata(To, From);
1215 }
1216 
1217 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1218                                       Instruction *From) {
1219   for (Value *V : To) {
1220     if (Instruction *I = dyn_cast<Instruction>(V))
1221       addMetadata(I, From);
1222   }
1223 }
1224 
1225 namespace llvm {
1226 
1227 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1228 // lowered.
1229 enum ScalarEpilogueLowering {
1230 
1231   // The default: allowing scalar epilogues.
1232   CM_ScalarEpilogueAllowed,
1233 
1234   // Vectorization with OptForSize: don't allow epilogues.
1235   CM_ScalarEpilogueNotAllowedOptSize,
1236 
1237   // A special case of vectorisation with OptForSize: loops with a very small
1238   // trip count are considered for vectorization under OptForSize, thereby
1239   // making sure the cost of their loop body is dominant, free of runtime
1240   // guards and scalar iteration overheads.
1241   CM_ScalarEpilogueNotAllowedLowTripLoop,
1242 
1243   // Loop hint predicate indicating an epilogue is undesired.
1244   CM_ScalarEpilogueNotNeededUsePredicate,
1245 
1246   // Directive indicating we must either tail fold or not vectorize
1247   CM_ScalarEpilogueNotAllowedUsePredicate
1248 };
1249 
1250 /// ElementCountComparator creates a total ordering for ElementCount
1251 /// for the purposes of using it in a set structure.
1252 struct ElementCountComparator {
1253   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1254     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1255            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1256   }
1257 };
1258 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1259 
1260 /// LoopVectorizationCostModel - estimates the expected speedups due to
1261 /// vectorization.
1262 /// In many cases vectorization is not profitable. This can happen because of
1263 /// a number of reasons. In this class we mainly attempt to predict the
1264 /// expected speedup/slowdowns due to the supported instruction set. We use the
1265 /// TargetTransformInfo to query the different backends for the cost of
1266 /// different operations.
1267 class LoopVectorizationCostModel {
1268 public:
1269   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1270                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1271                              LoopVectorizationLegality *Legal,
1272                              const TargetTransformInfo &TTI,
1273                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1274                              AssumptionCache *AC,
1275                              OptimizationRemarkEmitter *ORE, const Function *F,
1276                              const LoopVectorizeHints *Hints,
1277                              InterleavedAccessInfo &IAI)
1278       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1279         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1280         Hints(Hints), InterleaveInfo(IAI) {}
1281 
1282   /// \return An upper bound for the vectorization factors (both fixed and
1283   /// scalable). If the factors are 0, vectorization and interleaving should be
1284   /// avoided up front.
1285   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1286 
1287   /// \return True if runtime checks are required for vectorization, and false
1288   /// otherwise.
1289   bool runtimeChecksRequired();
1290 
1291   /// \return The most profitable vectorization factor and the cost of that VF.
1292   /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1293   /// then this vectorization factor will be selected if vectorization is
1294   /// possible.
1295   VectorizationFactor
1296   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1297 
1298   VectorizationFactor
1299   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1300                                     const LoopVectorizationPlanner &LVP);
1301 
1302   /// Setup cost-based decisions for user vectorization factor.
1303   /// \return true if the UserVF is a feasible VF to be chosen.
1304   bool selectUserVectorizationFactor(ElementCount UserVF) {
1305     collectUniformsAndScalars(UserVF);
1306     collectInstsToScalarize(UserVF);
1307     return expectedCost(UserVF).first.isValid();
1308   }
1309 
1310   /// \return The size (in bits) of the smallest and widest types in the code
1311   /// that needs to be vectorized. We ignore values that remain scalar such as
1312   /// 64 bit loop indices.
1313   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1314 
1315   /// \return The desired interleave count.
1316   /// If interleave count has been specified by metadata it will be returned.
1317   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1318   /// are the selected vectorization factor and the cost of the selected VF.
1319   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1320 
1321   /// Memory access instruction may be vectorized in more than one way.
1322   /// Form of instruction after vectorization depends on cost.
1323   /// This function takes cost-based decisions for Load/Store instructions
1324   /// and collects them in a map. This decisions map is used for building
1325   /// the lists of loop-uniform and loop-scalar instructions.
1326   /// The calculated cost is saved with widening decision in order to
1327   /// avoid redundant calculations.
1328   void setCostBasedWideningDecision(ElementCount VF);
1329 
1330   /// A struct that represents some properties of the register usage
1331   /// of a loop.
1332   struct RegisterUsage {
1333     /// Holds the number of loop invariant values that are used in the loop.
1334     /// The key is ClassID of target-provided register class.
1335     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1336     /// Holds the maximum number of concurrent live intervals in the loop.
1337     /// The key is ClassID of target-provided register class.
1338     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1339   };
1340 
1341   /// \return Returns information about the register usages of the loop for the
1342   /// given vectorization factors.
1343   SmallVector<RegisterUsage, 8>
1344   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1345 
1346   /// Collect values we want to ignore in the cost model.
1347   void collectValuesToIgnore();
1348 
1349   /// Collect all element types in the loop for which widening is needed.
1350   void collectElementTypesForWidening();
1351 
1352   /// Split reductions into those that happen in the loop, and those that happen
1353   /// outside. In loop reductions are collected into InLoopReductionChains.
1354   void collectInLoopReductions();
1355 
1356   /// Returns true if we should use strict in-order reductions for the given
1357   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1358   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1359   /// of FP operations.
1360   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1361     return !Hints->allowReordering() && RdxDesc.isOrdered();
1362   }
1363 
1364   /// \returns The smallest bitwidth each instruction can be represented with.
1365   /// The vector equivalents of these instructions should be truncated to this
1366   /// type.
1367   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1368     return MinBWs;
1369   }
1370 
1371   /// \returns True if it is more profitable to scalarize instruction \p I for
1372   /// vectorization factor \p VF.
1373   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1374     assert(VF.isVector() &&
1375            "Profitable to scalarize relevant only for VF > 1.");
1376 
1377     // Cost model is not run in the VPlan-native path - return conservative
1378     // result until this changes.
1379     if (EnableVPlanNativePath)
1380       return false;
1381 
1382     auto Scalars = InstsToScalarize.find(VF);
1383     assert(Scalars != InstsToScalarize.end() &&
1384            "VF not yet analyzed for scalarization profitability");
1385     return Scalars->second.find(I) != Scalars->second.end();
1386   }
1387 
1388   /// Returns true if \p I is known to be uniform after vectorization.
1389   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1390     if (VF.isScalar())
1391       return true;
1392 
1393     // Cost model is not run in the VPlan-native path - return conservative
1394     // result until this changes.
1395     if (EnableVPlanNativePath)
1396       return false;
1397 
1398     auto UniformsPerVF = Uniforms.find(VF);
1399     assert(UniformsPerVF != Uniforms.end() &&
1400            "VF not yet analyzed for uniformity");
1401     return UniformsPerVF->second.count(I);
1402   }
1403 
1404   /// Returns true if \p I is known to be scalar after vectorization.
1405   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1406     if (VF.isScalar())
1407       return true;
1408 
1409     // Cost model is not run in the VPlan-native path - return conservative
1410     // result until this changes.
1411     if (EnableVPlanNativePath)
1412       return false;
1413 
1414     auto ScalarsPerVF = Scalars.find(VF);
1415     assert(ScalarsPerVF != Scalars.end() &&
1416            "Scalar values are not calculated for VF");
1417     return ScalarsPerVF->second.count(I);
1418   }
1419 
1420   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1421   /// for vectorization factor \p VF.
1422   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1423     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1424            !isProfitableToScalarize(I, VF) &&
1425            !isScalarAfterVectorization(I, VF);
1426   }
1427 
1428   /// Decision that was taken during cost calculation for memory instruction.
1429   enum InstWidening {
1430     CM_Unknown,
1431     CM_Widen,         // For consecutive accesses with stride +1.
1432     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1433     CM_Interleave,
1434     CM_GatherScatter,
1435     CM_Scalarize
1436   };
1437 
1438   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1439   /// instruction \p I and vector width \p VF.
1440   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1441                            InstructionCost Cost) {
1442     assert(VF.isVector() && "Expected VF >=2");
1443     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1444   }
1445 
1446   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1447   /// interleaving group \p Grp and vector width \p VF.
1448   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1449                            ElementCount VF, InstWidening W,
1450                            InstructionCost Cost) {
1451     assert(VF.isVector() && "Expected VF >=2");
1452     /// Broadcast this decicion to all instructions inside the group.
1453     /// But the cost will be assigned to one instruction only.
1454     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1455       if (auto *I = Grp->getMember(i)) {
1456         if (Grp->getInsertPos() == I)
1457           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1458         else
1459           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1460       }
1461     }
1462   }
1463 
1464   /// Return the cost model decision for the given instruction \p I and vector
1465   /// width \p VF. Return CM_Unknown if this instruction did not pass
1466   /// through the cost modeling.
1467   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1468     assert(VF.isVector() && "Expected VF to be a vector VF");
1469     // Cost model is not run in the VPlan-native path - return conservative
1470     // result until this changes.
1471     if (EnableVPlanNativePath)
1472       return CM_GatherScatter;
1473 
1474     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1475     auto Itr = WideningDecisions.find(InstOnVF);
1476     if (Itr == WideningDecisions.end())
1477       return CM_Unknown;
1478     return Itr->second.first;
1479   }
1480 
1481   /// Return the vectorization cost for the given instruction \p I and vector
1482   /// width \p VF.
1483   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1484     assert(VF.isVector() && "Expected VF >=2");
1485     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1486     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1487            "The cost is not calculated");
1488     return WideningDecisions[InstOnVF].second;
1489   }
1490 
1491   /// Return True if instruction \p I is an optimizable truncate whose operand
1492   /// is an induction variable. Such a truncate will be removed by adding a new
1493   /// induction variable with the destination type.
1494   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1495     // If the instruction is not a truncate, return false.
1496     auto *Trunc = dyn_cast<TruncInst>(I);
1497     if (!Trunc)
1498       return false;
1499 
1500     // Get the source and destination types of the truncate.
1501     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1502     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1503 
1504     // If the truncate is free for the given types, return false. Replacing a
1505     // free truncate with an induction variable would add an induction variable
1506     // update instruction to each iteration of the loop. We exclude from this
1507     // check the primary induction variable since it will need an update
1508     // instruction regardless.
1509     Value *Op = Trunc->getOperand(0);
1510     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1511       return false;
1512 
1513     // If the truncated value is not an induction variable, return false.
1514     return Legal->isInductionPhi(Op);
1515   }
1516 
1517   /// Collects the instructions to scalarize for each predicated instruction in
1518   /// the loop.
1519   void collectInstsToScalarize(ElementCount VF);
1520 
1521   /// Collect Uniform and Scalar values for the given \p VF.
1522   /// The sets depend on CM decision for Load/Store instructions
1523   /// that may be vectorized as interleave, gather-scatter or scalarized.
1524   void collectUniformsAndScalars(ElementCount VF) {
1525     // Do the analysis once.
1526     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1527       return;
1528     setCostBasedWideningDecision(VF);
1529     collectLoopUniforms(VF);
1530     collectLoopScalars(VF);
1531   }
1532 
1533   /// Returns true if the target machine supports masked store operation
1534   /// for the given \p DataType and kind of access to \p Ptr.
1535   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1536     return Legal->isConsecutivePtr(DataType, Ptr) &&
1537            TTI.isLegalMaskedStore(DataType, Alignment);
1538   }
1539 
1540   /// Returns true if the target machine supports masked load operation
1541   /// for the given \p DataType and kind of access to \p Ptr.
1542   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1543     return Legal->isConsecutivePtr(DataType, Ptr) &&
1544            TTI.isLegalMaskedLoad(DataType, Alignment);
1545   }
1546 
1547   /// Returns true if the target machine can represent \p V as a masked gather
1548   /// or scatter operation.
1549   bool isLegalGatherOrScatter(Value *V) {
1550     bool LI = isa<LoadInst>(V);
1551     bool SI = isa<StoreInst>(V);
1552     if (!LI && !SI)
1553       return false;
1554     auto *Ty = getLoadStoreType(V);
1555     Align Align = getLoadStoreAlignment(V);
1556     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1557            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1558   }
1559 
1560   /// Returns true if the target machine supports all of the reduction
1561   /// variables found for the given VF.
1562   bool canVectorizeReductions(ElementCount VF) const {
1563     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1564       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1565       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1566     }));
1567   }
1568 
1569   /// Returns true if \p I is an instruction that will be scalarized with
1570   /// predication. Such instructions include conditional stores and
1571   /// instructions that may divide by zero.
1572   /// If a non-zero VF has been calculated, we check if I will be scalarized
1573   /// predication for that VF.
1574   bool isScalarWithPredication(Instruction *I) const;
1575 
1576   // Returns true if \p I is an instruction that will be predicated either
1577   // through scalar predication or masked load/store or masked gather/scatter.
1578   // Superset of instructions that return true for isScalarWithPredication.
1579   bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
1580     // When we know the load is uniform and the original scalar loop was not
1581     // predicated we don't need to mark it as a predicated instruction. Any
1582     // vectorised blocks created when tail-folding are something artificial we
1583     // have introduced and we know there is always at least one active lane.
1584     // That's why we call Legal->blockNeedsPredication here because it doesn't
1585     // query tail-folding.
1586     if (IsKnownUniform && isa<LoadInst>(I) &&
1587         !Legal->blockNeedsPredication(I->getParent()))
1588       return false;
1589     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1590       return false;
1591     // Loads and stores that need some form of masked operation are predicated
1592     // instructions.
1593     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1594       return Legal->isMaskRequired(I);
1595     return isScalarWithPredication(I);
1596   }
1597 
1598   /// Returns true if \p I is a memory instruction with consecutive memory
1599   /// access that can be widened.
1600   bool
1601   memoryInstructionCanBeWidened(Instruction *I,
1602                                 ElementCount VF = ElementCount::getFixed(1));
1603 
1604   /// Returns true if \p I is a memory instruction in an interleaved-group
1605   /// of memory accesses that can be vectorized with wide vector loads/stores
1606   /// and shuffles.
1607   bool
1608   interleavedAccessCanBeWidened(Instruction *I,
1609                                 ElementCount VF = ElementCount::getFixed(1));
1610 
1611   /// Check if \p Instr belongs to any interleaved access group.
1612   bool isAccessInterleaved(Instruction *Instr) {
1613     return InterleaveInfo.isInterleaved(Instr);
1614   }
1615 
1616   /// Get the interleaved access group that \p Instr belongs to.
1617   const InterleaveGroup<Instruction> *
1618   getInterleavedAccessGroup(Instruction *Instr) {
1619     return InterleaveInfo.getInterleaveGroup(Instr);
1620   }
1621 
1622   /// Returns true if we're required to use a scalar epilogue for at least
1623   /// the final iteration of the original loop.
1624   bool requiresScalarEpilogue(ElementCount VF) const {
1625     if (!isScalarEpilogueAllowed())
1626       return false;
1627     // If we might exit from anywhere but the latch, must run the exiting
1628     // iteration in scalar form.
1629     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1630       return true;
1631     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1632   }
1633 
1634   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1635   /// loop hint annotation.
1636   bool isScalarEpilogueAllowed() const {
1637     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1638   }
1639 
1640   /// Returns true if all loop blocks should be masked to fold tail loop.
1641   bool foldTailByMasking() const { return FoldTailByMasking; }
1642 
1643   /// Returns true if the instructions in this block requires predication
1644   /// for any reason, e.g. because tail folding now requires a predicate
1645   /// or because the block in the original loop was predicated.
1646   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1647     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1648   }
1649 
1650   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1651   /// nodes to the chain of instructions representing the reductions. Uses a
1652   /// MapVector to ensure deterministic iteration order.
1653   using ReductionChainMap =
1654       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1655 
1656   /// Return the chain of instructions representing an inloop reduction.
1657   const ReductionChainMap &getInLoopReductionChains() const {
1658     return InLoopReductionChains;
1659   }
1660 
1661   /// Returns true if the Phi is part of an inloop reduction.
1662   bool isInLoopReduction(PHINode *Phi) const {
1663     return InLoopReductionChains.count(Phi);
1664   }
1665 
1666   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1667   /// with factor VF.  Return the cost of the instruction, including
1668   /// scalarization overhead if it's needed.
1669   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1670 
1671   /// Estimate cost of a call instruction CI if it were vectorized with factor
1672   /// VF. Return the cost of the instruction, including scalarization overhead
1673   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1674   /// scalarized -
1675   /// i.e. either vector version isn't available, or is too expensive.
1676   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1677                                     bool &NeedToScalarize) const;
1678 
1679   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1680   /// that of B.
1681   bool isMoreProfitable(const VectorizationFactor &A,
1682                         const VectorizationFactor &B) const;
1683 
1684   /// Invalidates decisions already taken by the cost model.
1685   void invalidateCostModelingDecisions() {
1686     WideningDecisions.clear();
1687     Uniforms.clear();
1688     Scalars.clear();
1689   }
1690 
1691 private:
1692   unsigned NumPredStores = 0;
1693 
1694   /// \return An upper bound for the vectorization factors for both
1695   /// fixed and scalable vectorization, where the minimum-known number of
1696   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1697   /// disabled or unsupported, then the scalable part will be equal to
1698   /// ElementCount::getScalable(0).
1699   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1700                                            ElementCount UserVF,
1701                                            bool FoldTailByMasking);
1702 
1703   /// \return the maximized element count based on the targets vector
1704   /// registers and the loop trip-count, but limited to a maximum safe VF.
1705   /// This is a helper function of computeFeasibleMaxVF.
1706   /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1707   /// issue that occurred on one of the buildbots which cannot be reproduced
1708   /// without having access to the properietary compiler (see comments on
1709   /// D98509). The issue is currently under investigation and this workaround
1710   /// will be removed as soon as possible.
1711   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1712                                        unsigned SmallestType,
1713                                        unsigned WidestType,
1714                                        const ElementCount &MaxSafeVF,
1715                                        bool FoldTailByMasking);
1716 
1717   /// \return the maximum legal scalable VF, based on the safe max number
1718   /// of elements.
1719   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1720 
1721   /// The vectorization cost is a combination of the cost itself and a boolean
1722   /// indicating whether any of the contributing operations will actually
1723   /// operate on vector values after type legalization in the backend. If this
1724   /// latter value is false, then all operations will be scalarized (i.e. no
1725   /// vectorization has actually taken place).
1726   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1727 
1728   /// Returns the expected execution cost. The unit of the cost does
1729   /// not matter because we use the 'cost' units to compare different
1730   /// vector widths. The cost that is returned is *not* normalized by
1731   /// the factor width. If \p Invalid is not nullptr, this function
1732   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1733   /// each instruction that has an Invalid cost for the given VF.
1734   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1735   VectorizationCostTy
1736   expectedCost(ElementCount VF,
1737                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1738 
1739   /// Returns the execution time cost of an instruction for a given vector
1740   /// width. Vector width of one means scalar.
1741   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1742 
1743   /// The cost-computation logic from getInstructionCost which provides
1744   /// the vector type as an output parameter.
1745   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1746                                      Type *&VectorTy);
1747 
1748   /// Return the cost of instructions in an inloop reduction pattern, if I is
1749   /// part of that pattern.
1750   Optional<InstructionCost>
1751   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1752                           TTI::TargetCostKind CostKind);
1753 
1754   /// Calculate vectorization cost of memory instruction \p I.
1755   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1756 
1757   /// The cost computation for scalarized memory instruction.
1758   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1759 
1760   /// The cost computation for interleaving group of memory instructions.
1761   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1762 
1763   /// The cost computation for Gather/Scatter instruction.
1764   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1765 
1766   /// The cost computation for widening instruction \p I with consecutive
1767   /// memory access.
1768   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1769 
1770   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1771   /// Load: scalar load + broadcast.
1772   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1773   /// element)
1774   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1775 
1776   /// Estimate the overhead of scalarizing an instruction. This is a
1777   /// convenience wrapper for the type-based getScalarizationOverhead API.
1778   InstructionCost getScalarizationOverhead(Instruction *I,
1779                                            ElementCount VF) const;
1780 
1781   /// Returns whether the instruction is a load or store and will be a emitted
1782   /// as a vector operation.
1783   bool isConsecutiveLoadOrStore(Instruction *I);
1784 
1785   /// Returns true if an artificially high cost for emulated masked memrefs
1786   /// should be used.
1787   bool useEmulatedMaskMemRefHack(Instruction *I);
1788 
1789   /// Map of scalar integer values to the smallest bitwidth they can be legally
1790   /// represented as. The vector equivalents of these values should be truncated
1791   /// to this type.
1792   MapVector<Instruction *, uint64_t> MinBWs;
1793 
1794   /// A type representing the costs for instructions if they were to be
1795   /// scalarized rather than vectorized. The entries are Instruction-Cost
1796   /// pairs.
1797   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1798 
1799   /// A set containing all BasicBlocks that are known to present after
1800   /// vectorization as a predicated block.
1801   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1802 
1803   /// Records whether it is allowed to have the original scalar loop execute at
1804   /// least once. This may be needed as a fallback loop in case runtime
1805   /// aliasing/dependence checks fail, or to handle the tail/remainder
1806   /// iterations when the trip count is unknown or doesn't divide by the VF,
1807   /// or as a peel-loop to handle gaps in interleave-groups.
1808   /// Under optsize and when the trip count is very small we don't allow any
1809   /// iterations to execute in the scalar loop.
1810   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1811 
1812   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1813   bool FoldTailByMasking = false;
1814 
1815   /// A map holding scalar costs for different vectorization factors. The
1816   /// presence of a cost for an instruction in the mapping indicates that the
1817   /// instruction will be scalarized when vectorizing with the associated
1818   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1819   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1820 
1821   /// Holds the instructions known to be uniform after vectorization.
1822   /// The data is collected per VF.
1823   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1824 
1825   /// Holds the instructions known to be scalar after vectorization.
1826   /// The data is collected per VF.
1827   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1828 
1829   /// Holds the instructions (address computations) that are forced to be
1830   /// scalarized.
1831   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1832 
1833   /// PHINodes of the reductions that should be expanded in-loop along with
1834   /// their associated chains of reduction operations, in program order from top
1835   /// (PHI) to bottom
1836   ReductionChainMap InLoopReductionChains;
1837 
1838   /// A Map of inloop reduction operations and their immediate chain operand.
1839   /// FIXME: This can be removed once reductions can be costed correctly in
1840   /// vplan. This was added to allow quick lookup to the inloop operations,
1841   /// without having to loop through InLoopReductionChains.
1842   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1843 
1844   /// Returns the expected difference in cost from scalarizing the expression
1845   /// feeding a predicated instruction \p PredInst. The instructions to
1846   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1847   /// non-negative return value implies the expression will be scalarized.
1848   /// Currently, only single-use chains are considered for scalarization.
1849   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1850                               ElementCount VF);
1851 
1852   /// Collect the instructions that are uniform after vectorization. An
1853   /// instruction is uniform if we represent it with a single scalar value in
1854   /// the vectorized loop corresponding to each vector iteration. Examples of
1855   /// uniform instructions include pointer operands of consecutive or
1856   /// interleaved memory accesses. Note that although uniformity implies an
1857   /// instruction will be scalar, the reverse is not true. In general, a
1858   /// scalarized instruction will be represented by VF scalar values in the
1859   /// vectorized loop, each corresponding to an iteration of the original
1860   /// scalar loop.
1861   void collectLoopUniforms(ElementCount VF);
1862 
1863   /// Collect the instructions that are scalar after vectorization. An
1864   /// instruction is scalar if it is known to be uniform or will be scalarized
1865   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1866   /// to the list if they are used by a load/store instruction that is marked as
1867   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1868   /// VF values in the vectorized loop, each corresponding to an iteration of
1869   /// the original scalar loop.
1870   void collectLoopScalars(ElementCount VF);
1871 
1872   /// Keeps cost model vectorization decision and cost for instructions.
1873   /// Right now it is used for memory instructions only.
1874   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1875                                 std::pair<InstWidening, InstructionCost>>;
1876 
1877   DecisionList WideningDecisions;
1878 
1879   /// Returns true if \p V is expected to be vectorized and it needs to be
1880   /// extracted.
1881   bool needsExtract(Value *V, ElementCount VF) const {
1882     Instruction *I = dyn_cast<Instruction>(V);
1883     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1884         TheLoop->isLoopInvariant(I))
1885       return false;
1886 
1887     // Assume we can vectorize V (and hence we need extraction) if the
1888     // scalars are not computed yet. This can happen, because it is called
1889     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1890     // the scalars are collected. That should be a safe assumption in most
1891     // cases, because we check if the operands have vectorizable types
1892     // beforehand in LoopVectorizationLegality.
1893     return Scalars.find(VF) == Scalars.end() ||
1894            !isScalarAfterVectorization(I, VF);
1895   };
1896 
1897   /// Returns a range containing only operands needing to be extracted.
1898   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1899                                                    ElementCount VF) const {
1900     return SmallVector<Value *, 4>(make_filter_range(
1901         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1902   }
1903 
1904   /// Determines if we have the infrastructure to vectorize loop \p L and its
1905   /// epilogue, assuming the main loop is vectorized by \p VF.
1906   bool isCandidateForEpilogueVectorization(const Loop &L,
1907                                            const ElementCount VF) const;
1908 
1909   /// Returns true if epilogue vectorization is considered profitable, and
1910   /// false otherwise.
1911   /// \p VF is the vectorization factor chosen for the original loop.
1912   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1913 
1914 public:
1915   /// The loop that we evaluate.
1916   Loop *TheLoop;
1917 
1918   /// Predicated scalar evolution analysis.
1919   PredicatedScalarEvolution &PSE;
1920 
1921   /// Loop Info analysis.
1922   LoopInfo *LI;
1923 
1924   /// Vectorization legality.
1925   LoopVectorizationLegality *Legal;
1926 
1927   /// Vector target information.
1928   const TargetTransformInfo &TTI;
1929 
1930   /// Target Library Info.
1931   const TargetLibraryInfo *TLI;
1932 
1933   /// Demanded bits analysis.
1934   DemandedBits *DB;
1935 
1936   /// Assumption cache.
1937   AssumptionCache *AC;
1938 
1939   /// Interface to emit optimization remarks.
1940   OptimizationRemarkEmitter *ORE;
1941 
1942   const Function *TheFunction;
1943 
1944   /// Loop Vectorize Hint.
1945   const LoopVectorizeHints *Hints;
1946 
1947   /// The interleave access information contains groups of interleaved accesses
1948   /// with the same stride and close to each other.
1949   InterleavedAccessInfo &InterleaveInfo;
1950 
1951   /// Values to ignore in the cost model.
1952   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1953 
1954   /// Values to ignore in the cost model when VF > 1.
1955   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1956 
1957   /// All element types found in the loop.
1958   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1959 
1960   /// Profitable vector factors.
1961   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1962 };
1963 } // end namespace llvm
1964 
1965 /// Helper struct to manage generating runtime checks for vectorization.
1966 ///
1967 /// The runtime checks are created up-front in temporary blocks to allow better
1968 /// estimating the cost and un-linked from the existing IR. After deciding to
1969 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1970 /// temporary blocks are completely removed.
1971 class GeneratedRTChecks {
1972   /// Basic block which contains the generated SCEV checks, if any.
1973   BasicBlock *SCEVCheckBlock = nullptr;
1974 
1975   /// The value representing the result of the generated SCEV checks. If it is
1976   /// nullptr, either no SCEV checks have been generated or they have been used.
1977   Value *SCEVCheckCond = nullptr;
1978 
1979   /// Basic block which contains the generated memory runtime checks, if any.
1980   BasicBlock *MemCheckBlock = nullptr;
1981 
1982   /// The value representing the result of the generated memory runtime checks.
1983   /// If it is nullptr, either no memory runtime checks have been generated or
1984   /// they have been used.
1985   Value *MemRuntimeCheckCond = nullptr;
1986 
1987   DominatorTree *DT;
1988   LoopInfo *LI;
1989 
1990   SCEVExpander SCEVExp;
1991   SCEVExpander MemCheckExp;
1992 
1993 public:
1994   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1995                     const DataLayout &DL)
1996       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1997         MemCheckExp(SE, DL, "scev.check") {}
1998 
1999   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
2000   /// accurately estimate the cost of the runtime checks. The blocks are
2001   /// un-linked from the IR and is added back during vector code generation. If
2002   /// there is no vector code generation, the check blocks are removed
2003   /// completely.
2004   void Create(Loop *L, const LoopAccessInfo &LAI,
2005               const SCEVUnionPredicate &UnionPred) {
2006 
2007     BasicBlock *LoopHeader = L->getHeader();
2008     BasicBlock *Preheader = L->getLoopPreheader();
2009 
2010     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
2011     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
2012     // may be used by SCEVExpander. The blocks will be un-linked from their
2013     // predecessors and removed from LI & DT at the end of the function.
2014     if (!UnionPred.isAlwaysTrue()) {
2015       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
2016                                   nullptr, "vector.scevcheck");
2017 
2018       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
2019           &UnionPred, SCEVCheckBlock->getTerminator());
2020     }
2021 
2022     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2023     if (RtPtrChecking.Need) {
2024       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2025       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2026                                  "vector.memcheck");
2027 
2028       MemRuntimeCheckCond =
2029           addRuntimeChecks(MemCheckBlock->getTerminator(), L,
2030                            RtPtrChecking.getChecks(), MemCheckExp);
2031       assert(MemRuntimeCheckCond &&
2032              "no RT checks generated although RtPtrChecking "
2033              "claimed checks are required");
2034     }
2035 
2036     if (!MemCheckBlock && !SCEVCheckBlock)
2037       return;
2038 
2039     // Unhook the temporary block with the checks, update various places
2040     // accordingly.
2041     if (SCEVCheckBlock)
2042       SCEVCheckBlock->replaceAllUsesWith(Preheader);
2043     if (MemCheckBlock)
2044       MemCheckBlock->replaceAllUsesWith(Preheader);
2045 
2046     if (SCEVCheckBlock) {
2047       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2048       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2049       Preheader->getTerminator()->eraseFromParent();
2050     }
2051     if (MemCheckBlock) {
2052       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2053       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2054       Preheader->getTerminator()->eraseFromParent();
2055     }
2056 
2057     DT->changeImmediateDominator(LoopHeader, Preheader);
2058     if (MemCheckBlock) {
2059       DT->eraseNode(MemCheckBlock);
2060       LI->removeBlock(MemCheckBlock);
2061     }
2062     if (SCEVCheckBlock) {
2063       DT->eraseNode(SCEVCheckBlock);
2064       LI->removeBlock(SCEVCheckBlock);
2065     }
2066   }
2067 
2068   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2069   /// unused.
2070   ~GeneratedRTChecks() {
2071     SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
2072     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
2073     if (!SCEVCheckCond)
2074       SCEVCleaner.markResultUsed();
2075 
2076     if (!MemRuntimeCheckCond)
2077       MemCheckCleaner.markResultUsed();
2078 
2079     if (MemRuntimeCheckCond) {
2080       auto &SE = *MemCheckExp.getSE();
2081       // Memory runtime check generation creates compares that use expanded
2082       // values. Remove them before running the SCEVExpanderCleaners.
2083       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2084         if (MemCheckExp.isInsertedInstruction(&I))
2085           continue;
2086         SE.forgetValue(&I);
2087         I.eraseFromParent();
2088       }
2089     }
2090     MemCheckCleaner.cleanup();
2091     SCEVCleaner.cleanup();
2092 
2093     if (SCEVCheckCond)
2094       SCEVCheckBlock->eraseFromParent();
2095     if (MemRuntimeCheckCond)
2096       MemCheckBlock->eraseFromParent();
2097   }
2098 
2099   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2100   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2101   /// depending on the generated condition.
2102   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2103                              BasicBlock *LoopVectorPreHeader,
2104                              BasicBlock *LoopExitBlock) {
2105     if (!SCEVCheckCond)
2106       return nullptr;
2107     if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2108       if (C->isZero())
2109         return nullptr;
2110 
2111     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2112 
2113     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2114     // Create new preheader for vector loop.
2115     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2116       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2117 
2118     SCEVCheckBlock->getTerminator()->eraseFromParent();
2119     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2120     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2121                                                 SCEVCheckBlock);
2122 
2123     DT->addNewBlock(SCEVCheckBlock, Pred);
2124     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2125 
2126     ReplaceInstWithInst(
2127         SCEVCheckBlock->getTerminator(),
2128         BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2129     // Mark the check as used, to prevent it from being removed during cleanup.
2130     SCEVCheckCond = nullptr;
2131     return SCEVCheckBlock;
2132   }
2133 
2134   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2135   /// the branches to branch to the vector preheader or \p Bypass, depending on
2136   /// the generated condition.
2137   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2138                                    BasicBlock *LoopVectorPreHeader) {
2139     // Check if we generated code that checks in runtime if arrays overlap.
2140     if (!MemRuntimeCheckCond)
2141       return nullptr;
2142 
2143     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2144     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2145                                                 MemCheckBlock);
2146 
2147     DT->addNewBlock(MemCheckBlock, Pred);
2148     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2149     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2150 
2151     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2152       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2153 
2154     ReplaceInstWithInst(
2155         MemCheckBlock->getTerminator(),
2156         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2157     MemCheckBlock->getTerminator()->setDebugLoc(
2158         Pred->getTerminator()->getDebugLoc());
2159 
2160     // Mark the check as used, to prevent it from being removed during cleanup.
2161     MemRuntimeCheckCond = nullptr;
2162     return MemCheckBlock;
2163   }
2164 };
2165 
2166 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2167 // vectorization. The loop needs to be annotated with #pragma omp simd
2168 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2169 // vector length information is not provided, vectorization is not considered
2170 // explicit. Interleave hints are not allowed either. These limitations will be
2171 // relaxed in the future.
2172 // Please, note that we are currently forced to abuse the pragma 'clang
2173 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2174 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2175 // provides *explicit vectorization hints* (LV can bypass legal checks and
2176 // assume that vectorization is legal). However, both hints are implemented
2177 // using the same metadata (llvm.loop.vectorize, processed by
2178 // LoopVectorizeHints). This will be fixed in the future when the native IR
2179 // representation for pragma 'omp simd' is introduced.
2180 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2181                                    OptimizationRemarkEmitter *ORE) {
2182   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2183   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2184 
2185   // Only outer loops with an explicit vectorization hint are supported.
2186   // Unannotated outer loops are ignored.
2187   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2188     return false;
2189 
2190   Function *Fn = OuterLp->getHeader()->getParent();
2191   if (!Hints.allowVectorization(Fn, OuterLp,
2192                                 true /*VectorizeOnlyWhenForced*/)) {
2193     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2194     return false;
2195   }
2196 
2197   if (Hints.getInterleave() > 1) {
2198     // TODO: Interleave support is future work.
2199     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2200                          "outer loops.\n");
2201     Hints.emitRemarkWithHints();
2202     return false;
2203   }
2204 
2205   return true;
2206 }
2207 
2208 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2209                                   OptimizationRemarkEmitter *ORE,
2210                                   SmallVectorImpl<Loop *> &V) {
2211   // Collect inner loops and outer loops without irreducible control flow. For
2212   // now, only collect outer loops that have explicit vectorization hints. If we
2213   // are stress testing the VPlan H-CFG construction, we collect the outermost
2214   // loop of every loop nest.
2215   if (L.isInnermost() || VPlanBuildStressTest ||
2216       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2217     LoopBlocksRPO RPOT(&L);
2218     RPOT.perform(LI);
2219     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2220       V.push_back(&L);
2221       // TODO: Collect inner loops inside marked outer loops in case
2222       // vectorization fails for the outer loop. Do not invoke
2223       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2224       // already known to be reducible. We can use an inherited attribute for
2225       // that.
2226       return;
2227     }
2228   }
2229   for (Loop *InnerL : L)
2230     collectSupportedLoops(*InnerL, LI, ORE, V);
2231 }
2232 
2233 namespace {
2234 
2235 /// The LoopVectorize Pass.
2236 struct LoopVectorize : public FunctionPass {
2237   /// Pass identification, replacement for typeid
2238   static char ID;
2239 
2240   LoopVectorizePass Impl;
2241 
2242   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2243                          bool VectorizeOnlyWhenForced = false)
2244       : FunctionPass(ID),
2245         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2246     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2247   }
2248 
2249   bool runOnFunction(Function &F) override {
2250     if (skipFunction(F))
2251       return false;
2252 
2253     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2254     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2255     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2256     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2257     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2258     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2259     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2260     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2261     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2262     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2263     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2264     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2265     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2266 
2267     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2268         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2269 
2270     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2271                         GetLAA, *ORE, PSI).MadeAnyChange;
2272   }
2273 
2274   void getAnalysisUsage(AnalysisUsage &AU) const override {
2275     AU.addRequired<AssumptionCacheTracker>();
2276     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2277     AU.addRequired<DominatorTreeWrapperPass>();
2278     AU.addRequired<LoopInfoWrapperPass>();
2279     AU.addRequired<ScalarEvolutionWrapperPass>();
2280     AU.addRequired<TargetTransformInfoWrapperPass>();
2281     AU.addRequired<AAResultsWrapperPass>();
2282     AU.addRequired<LoopAccessLegacyAnalysis>();
2283     AU.addRequired<DemandedBitsWrapperPass>();
2284     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2285     AU.addRequired<InjectTLIMappingsLegacy>();
2286 
2287     // We currently do not preserve loopinfo/dominator analyses with outer loop
2288     // vectorization. Until this is addressed, mark these analyses as preserved
2289     // only for non-VPlan-native path.
2290     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2291     if (!EnableVPlanNativePath) {
2292       AU.addPreserved<LoopInfoWrapperPass>();
2293       AU.addPreserved<DominatorTreeWrapperPass>();
2294     }
2295 
2296     AU.addPreserved<BasicAAWrapperPass>();
2297     AU.addPreserved<GlobalsAAWrapperPass>();
2298     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2299   }
2300 };
2301 
2302 } // end anonymous namespace
2303 
2304 //===----------------------------------------------------------------------===//
2305 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2306 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2307 //===----------------------------------------------------------------------===//
2308 
2309 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2310   // We need to place the broadcast of invariant variables outside the loop,
2311   // but only if it's proven safe to do so. Else, broadcast will be inside
2312   // vector loop body.
2313   Instruction *Instr = dyn_cast<Instruction>(V);
2314   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2315                      (!Instr ||
2316                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2317   // Place the code for broadcasting invariant variables in the new preheader.
2318   IRBuilder<>::InsertPointGuard Guard(Builder);
2319   if (SafeToHoist)
2320     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2321 
2322   // Broadcast the scalar into all locations in the vector.
2323   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2324 
2325   return Shuf;
2326 }
2327 
2328 /// This function adds
2329 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
2330 /// to each vector element of Val. The sequence starts at StartIndex.
2331 /// \p Opcode is relevant for FP induction variable.
2332 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2333                             Instruction::BinaryOps BinOp, ElementCount VF,
2334                             IRBuilder<> &Builder) {
2335   if (VF.isScalar()) {
2336     // When unrolling and the VF is 1, we only need to add a simple scalar.
2337     Type *Ty = Val->getType();
2338     assert(!Ty->isVectorTy() && "Val must be a scalar");
2339 
2340     if (Ty->isFloatingPointTy()) {
2341       // Floating-point operations inherit FMF via the builder's flags.
2342       Value *MulOp = Builder.CreateFMul(StartIdx, Step);
2343       return Builder.CreateBinOp(BinOp, Val, MulOp);
2344     }
2345     return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step),
2346                              "induction");
2347   }
2348 
2349   // Create and check the types.
2350   auto *ValVTy = cast<VectorType>(Val->getType());
2351   ElementCount VLen = ValVTy->getElementCount();
2352 
2353   Type *STy = Val->getType()->getScalarType();
2354   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2355          "Induction Step must be an integer or FP");
2356   assert(Step->getType() == STy && "Step has wrong type");
2357 
2358   SmallVector<Constant *, 8> Indices;
2359 
2360   // Create a vector of consecutive numbers from zero to VF.
2361   VectorType *InitVecValVTy = ValVTy;
2362   Type *InitVecValSTy = STy;
2363   if (STy->isFloatingPointTy()) {
2364     InitVecValSTy =
2365         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2366     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2367   }
2368   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2369 
2370   // Splat the StartIdx
2371   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2372 
2373   if (STy->isIntegerTy()) {
2374     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2375     Step = Builder.CreateVectorSplat(VLen, Step);
2376     assert(Step->getType() == Val->getType() && "Invalid step vec");
2377     // FIXME: The newly created binary instructions should contain nsw/nuw
2378     // flags, which can be found from the original scalar operations.
2379     Step = Builder.CreateMul(InitVec, Step);
2380     return Builder.CreateAdd(Val, Step, "induction");
2381   }
2382 
2383   // Floating point induction.
2384   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2385          "Binary Opcode should be specified for FP induction");
2386   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2387   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2388 
2389   Step = Builder.CreateVectorSplat(VLen, Step);
2390   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2391   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2392 }
2393 
2394 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2395     const InductionDescriptor &II, Value *Step, Value *Start,
2396     Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
2397   IRBuilder<> &Builder = State.Builder;
2398   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2399          "Expected either an induction phi-node or a truncate of it!");
2400 
2401   // Construct the initial value of the vector IV in the vector loop preheader
2402   auto CurrIP = Builder.saveIP();
2403   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2404   if (isa<TruncInst>(EntryVal)) {
2405     assert(Start->getType()->isIntegerTy() &&
2406            "Truncation requires an integer type");
2407     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2408     Step = Builder.CreateTrunc(Step, TruncType);
2409     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2410   }
2411 
2412   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
2413   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
2414   Value *SteppedStart = getStepVector(
2415       SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder);
2416 
2417   // We create vector phi nodes for both integer and floating-point induction
2418   // variables. Here, we determine the kind of arithmetic we will perform.
2419   Instruction::BinaryOps AddOp;
2420   Instruction::BinaryOps MulOp;
2421   if (Step->getType()->isIntegerTy()) {
2422     AddOp = Instruction::Add;
2423     MulOp = Instruction::Mul;
2424   } else {
2425     AddOp = II.getInductionOpcode();
2426     MulOp = Instruction::FMul;
2427   }
2428 
2429   // Multiply the vectorization factor by the step using integer or
2430   // floating-point arithmetic as appropriate.
2431   Type *StepType = Step->getType();
2432   Value *RuntimeVF;
2433   if (Step->getType()->isFloatingPointTy())
2434     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
2435   else
2436     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
2437   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2438 
2439   // Create a vector splat to use in the induction update.
2440   //
2441   // FIXME: If the step is non-constant, we create the vector splat with
2442   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2443   //        handle a constant vector splat.
2444   Value *SplatVF = isa<Constant>(Mul)
2445                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
2446                        : Builder.CreateVectorSplat(State.VF, Mul);
2447   Builder.restoreIP(CurrIP);
2448 
2449   // We may need to add the step a number of times, depending on the unroll
2450   // factor. The last of those goes into the PHI.
2451   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2452                                     &*LoopVectorBody->getFirstInsertionPt());
2453   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2454   Instruction *LastInduction = VecInd;
2455   for (unsigned Part = 0; Part < UF; ++Part) {
2456     State.set(Def, LastInduction, Part);
2457 
2458     if (isa<TruncInst>(EntryVal))
2459       addMetadata(LastInduction, EntryVal);
2460 
2461     LastInduction = cast<Instruction>(
2462         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2463     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2464   }
2465 
2466   // Move the last step to the end of the latch block. This ensures consistent
2467   // placement of all induction updates.
2468   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2469   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2470   auto *ICmp = cast<Instruction>(Br->getCondition());
2471   LastInduction->moveBefore(ICmp);
2472   LastInduction->setName("vec.ind.next");
2473 
2474   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2475   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2476 }
2477 
2478 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2479   return Cost->isScalarAfterVectorization(I, VF) ||
2480          Cost->isProfitableToScalarize(I, VF);
2481 }
2482 
2483 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2484   if (shouldScalarizeInstruction(IV))
2485     return true;
2486   auto isScalarInst = [&](User *U) -> bool {
2487     auto *I = cast<Instruction>(U);
2488     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2489   };
2490   return llvm::any_of(IV->users(), isScalarInst);
2491 }
2492 
2493 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
2494                                                 const InductionDescriptor &ID,
2495                                                 Value *Start, TruncInst *Trunc,
2496                                                 VPValue *Def,
2497                                                 VPTransformState &State) {
2498   IRBuilder<> &Builder = State.Builder;
2499   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2500          "Primary induction variable must have an integer type");
2501   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2502 
2503   // The value from the original loop to which we are mapping the new induction
2504   // variable.
2505   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2506 
2507   auto &DL = EntryVal->getModule()->getDataLayout();
2508 
2509   // Generate code for the induction step. Note that induction steps are
2510   // required to be loop-invariant
2511   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2512     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2513            "Induction step should be loop invariant");
2514     if (PSE.getSE()->isSCEVable(IV->getType())) {
2515       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2516       return Exp.expandCodeFor(Step, Step->getType(),
2517                                State.CFG.VectorPreHeader->getTerminator());
2518     }
2519     return cast<SCEVUnknown>(Step)->getValue();
2520   };
2521 
2522   // The scalar value to broadcast. This is derived from the canonical
2523   // induction variable. If a truncation type is given, truncate the canonical
2524   // induction variable and step. Otherwise, derive these values from the
2525   // induction descriptor.
2526   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2527     Value *ScalarIV = Induction;
2528     if (IV != OldInduction) {
2529       ScalarIV = IV->getType()->isIntegerTy()
2530                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2531                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2532                                           IV->getType());
2533       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID,
2534                                       State.CFG.PrevBB);
2535       ScalarIV->setName("offset.idx");
2536     }
2537     if (Trunc) {
2538       auto *TruncType = cast<IntegerType>(Trunc->getType());
2539       assert(Step->getType()->isIntegerTy() &&
2540              "Truncation requires an integer step");
2541       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2542       Step = Builder.CreateTrunc(Step, TruncType);
2543     }
2544     return ScalarIV;
2545   };
2546 
2547   // Create the vector values from the scalar IV, in the absence of creating a
2548   // vector IV.
2549   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2550     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2551     for (unsigned Part = 0; Part < UF; ++Part) {
2552       assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
2553       Value *StartIdx;
2554       if (Step->getType()->isFloatingPointTy())
2555         StartIdx =
2556             getRuntimeVFAsFloat(Builder, Step->getType(), State.VF * Part);
2557       else
2558         StartIdx = getRuntimeVF(Builder, Step->getType(), State.VF * Part);
2559 
2560       Value *EntryPart =
2561           getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode(),
2562                         State.VF, State.Builder);
2563       State.set(Def, EntryPart, Part);
2564       if (Trunc)
2565         addMetadata(EntryPart, Trunc);
2566     }
2567   };
2568 
2569   // Fast-math-flags propagate from the original induction instruction.
2570   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2571   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2572     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2573 
2574   // Now do the actual transformations, and start with creating the step value.
2575   Value *Step = CreateStepValue(ID.getStep());
2576   if (State.VF.isZero() || State.VF.isScalar()) {
2577     Value *ScalarIV = CreateScalarIV(Step);
2578     CreateSplatIV(ScalarIV, Step);
2579     return;
2580   }
2581 
2582   // Determine if we want a scalar version of the induction variable. This is
2583   // true if the induction variable itself is not widened, or if it has at
2584   // least one user in the loop that is not widened.
2585   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2586   if (!NeedsScalarIV) {
2587     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
2588     return;
2589   }
2590 
2591   // Try to create a new independent vector induction variable. If we can't
2592   // create the phi node, we will splat the scalar induction variable in each
2593   // loop iteration.
2594   if (!shouldScalarizeInstruction(EntryVal)) {
2595     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
2596     Value *ScalarIV = CreateScalarIV(Step);
2597     // Create scalar steps that can be used by instructions we will later
2598     // scalarize. Note that the addition of the scalar steps will not increase
2599     // the number of instructions in the loop in the common case prior to
2600     // InstCombine. We will be trading one vector extract for each scalar step.
2601     buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
2602     return;
2603   }
2604 
2605   // All IV users are scalar instructions, so only emit a scalar IV, not a
2606   // vectorised IV. Except when we tail-fold, then the splat IV feeds the
2607   // predicate used by the masked loads/stores.
2608   Value *ScalarIV = CreateScalarIV(Step);
2609   if (!Cost->isScalarEpilogueAllowed())
2610     CreateSplatIV(ScalarIV, Step);
2611   buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
2612 }
2613 
2614 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2615                                            Instruction *EntryVal,
2616                                            const InductionDescriptor &ID,
2617                                            VPValue *Def,
2618                                            VPTransformState &State) {
2619   IRBuilder<> &Builder = State.Builder;
2620   // We shouldn't have to build scalar steps if we aren't vectorizing.
2621   assert(State.VF.isVector() && "VF should be greater than one");
2622   // Get the value type and ensure it and the step have the same integer type.
2623   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2624   assert(ScalarIVTy == Step->getType() &&
2625          "Val and Step should have the same type");
2626 
2627   // We build scalar steps for both integer and floating-point induction
2628   // variables. Here, we determine the kind of arithmetic we will perform.
2629   Instruction::BinaryOps AddOp;
2630   Instruction::BinaryOps MulOp;
2631   if (ScalarIVTy->isIntegerTy()) {
2632     AddOp = Instruction::Add;
2633     MulOp = Instruction::Mul;
2634   } else {
2635     AddOp = ID.getInductionOpcode();
2636     MulOp = Instruction::FMul;
2637   }
2638 
2639   // Determine the number of scalars we need to generate for each unroll
2640   // iteration. If EntryVal is uniform, we only need to generate the first
2641   // lane. Otherwise, we generate all VF values.
2642   bool IsUniform =
2643       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF);
2644   unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
2645   // Compute the scalar steps and save the results in State.
2646   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2647                                      ScalarIVTy->getScalarSizeInBits());
2648   Type *VecIVTy = nullptr;
2649   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2650   if (!IsUniform && State.VF.isScalable()) {
2651     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2652     UnitStepVec =
2653         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2654     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2655     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2656   }
2657 
2658   for (unsigned Part = 0; Part < State.UF; ++Part) {
2659     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2660 
2661     if (!IsUniform && State.VF.isScalable()) {
2662       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2663       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2664       if (ScalarIVTy->isFloatingPointTy())
2665         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2666       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2667       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2668       State.set(Def, Add, Part);
2669       // It's useful to record the lane values too for the known minimum number
2670       // of elements so we do those below. This improves the code quality when
2671       // trying to extract the first element, for example.
2672     }
2673 
2674     if (ScalarIVTy->isFloatingPointTy())
2675       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2676 
2677     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2678       Value *StartIdx = Builder.CreateBinOp(
2679           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2680       // The step returned by `createStepForVF` is a runtime-evaluated value
2681       // when VF is scalable. Otherwise, it should be folded into a Constant.
2682       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2683              "Expected StartIdx to be folded to a constant when VF is not "
2684              "scalable");
2685       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2686       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2687       State.set(Def, Add, VPIteration(Part, Lane));
2688     }
2689   }
2690 }
2691 
2692 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2693                                                     const VPIteration &Instance,
2694                                                     VPTransformState &State) {
2695   Value *ScalarInst = State.get(Def, Instance);
2696   Value *VectorValue = State.get(Def, Instance.Part);
2697   VectorValue = Builder.CreateInsertElement(
2698       VectorValue, ScalarInst,
2699       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2700   State.set(Def, VectorValue, Instance.Part);
2701 }
2702 
2703 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2704   assert(Vec->getType()->isVectorTy() && "Invalid type");
2705   return Builder.CreateVectorReverse(Vec, "reverse");
2706 }
2707 
2708 // Return whether we allow using masked interleave-groups (for dealing with
2709 // strided loads/stores that reside in predicated blocks, or for dealing
2710 // with gaps).
2711 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2712   // If an override option has been passed in for interleaved accesses, use it.
2713   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2714     return EnableMaskedInterleavedMemAccesses;
2715 
2716   return TTI.enableMaskedInterleavedAccessVectorization();
2717 }
2718 
2719 // Try to vectorize the interleave group that \p Instr belongs to.
2720 //
2721 // E.g. Translate following interleaved load group (factor = 3):
2722 //   for (i = 0; i < N; i+=3) {
2723 //     R = Pic[i];             // Member of index 0
2724 //     G = Pic[i+1];           // Member of index 1
2725 //     B = Pic[i+2];           // Member of index 2
2726 //     ... // do something to R, G, B
2727 //   }
2728 // To:
2729 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2730 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2731 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2732 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2733 //
2734 // Or translate following interleaved store group (factor = 3):
2735 //   for (i = 0; i < N; i+=3) {
2736 //     ... do something to R, G, B
2737 //     Pic[i]   = R;           // Member of index 0
2738 //     Pic[i+1] = G;           // Member of index 1
2739 //     Pic[i+2] = B;           // Member of index 2
2740 //   }
2741 // To:
2742 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2743 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2744 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2745 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2746 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2747 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2748     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2749     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2750     VPValue *BlockInMask) {
2751   Instruction *Instr = Group->getInsertPos();
2752   const DataLayout &DL = Instr->getModule()->getDataLayout();
2753 
2754   // Prepare for the vector type of the interleaved load/store.
2755   Type *ScalarTy = getLoadStoreType(Instr);
2756   unsigned InterleaveFactor = Group->getFactor();
2757   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2758   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2759 
2760   // Prepare for the new pointers.
2761   SmallVector<Value *, 2> AddrParts;
2762   unsigned Index = Group->getIndex(Instr);
2763 
2764   // TODO: extend the masked interleaved-group support to reversed access.
2765   assert((!BlockInMask || !Group->isReverse()) &&
2766          "Reversed masked interleave-group not supported.");
2767 
2768   // If the group is reverse, adjust the index to refer to the last vector lane
2769   // instead of the first. We adjust the index from the first vector lane,
2770   // rather than directly getting the pointer for lane VF - 1, because the
2771   // pointer operand of the interleaved access is supposed to be uniform. For
2772   // uniform instructions, we're only required to generate a value for the
2773   // first vector lane in each unroll iteration.
2774   if (Group->isReverse())
2775     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2776 
2777   for (unsigned Part = 0; Part < UF; Part++) {
2778     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2779     setDebugLocFromInst(AddrPart);
2780 
2781     // Notice current instruction could be any index. Need to adjust the address
2782     // to the member of index 0.
2783     //
2784     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2785     //       b = A[i];       // Member of index 0
2786     // Current pointer is pointed to A[i+1], adjust it to A[i].
2787     //
2788     // E.g.  A[i+1] = a;     // Member of index 1
2789     //       A[i]   = b;     // Member of index 0
2790     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2791     // Current pointer is pointed to A[i+2], adjust it to A[i].
2792 
2793     bool InBounds = false;
2794     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2795       InBounds = gep->isInBounds();
2796     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2797     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2798 
2799     // Cast to the vector pointer type.
2800     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2801     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2802     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2803   }
2804 
2805   setDebugLocFromInst(Instr);
2806   Value *PoisonVec = PoisonValue::get(VecTy);
2807 
2808   Value *MaskForGaps = nullptr;
2809   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2810     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2811     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2812   }
2813 
2814   // Vectorize the interleaved load group.
2815   if (isa<LoadInst>(Instr)) {
2816     // For each unroll part, create a wide load for the group.
2817     SmallVector<Value *, 2> NewLoads;
2818     for (unsigned Part = 0; Part < UF; Part++) {
2819       Instruction *NewLoad;
2820       if (BlockInMask || MaskForGaps) {
2821         assert(useMaskedInterleavedAccesses(*TTI) &&
2822                "masked interleaved groups are not allowed.");
2823         Value *GroupMask = MaskForGaps;
2824         if (BlockInMask) {
2825           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2826           Value *ShuffledMask = Builder.CreateShuffleVector(
2827               BlockInMaskPart,
2828               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2829               "interleaved.mask");
2830           GroupMask = MaskForGaps
2831                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2832                                                 MaskForGaps)
2833                           : ShuffledMask;
2834         }
2835         NewLoad =
2836             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2837                                      GroupMask, PoisonVec, "wide.masked.vec");
2838       }
2839       else
2840         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2841                                             Group->getAlign(), "wide.vec");
2842       Group->addMetadata(NewLoad);
2843       NewLoads.push_back(NewLoad);
2844     }
2845 
2846     // For each member in the group, shuffle out the appropriate data from the
2847     // wide loads.
2848     unsigned J = 0;
2849     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2850       Instruction *Member = Group->getMember(I);
2851 
2852       // Skip the gaps in the group.
2853       if (!Member)
2854         continue;
2855 
2856       auto StrideMask =
2857           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2858       for (unsigned Part = 0; Part < UF; Part++) {
2859         Value *StridedVec = Builder.CreateShuffleVector(
2860             NewLoads[Part], StrideMask, "strided.vec");
2861 
2862         // If this member has different type, cast the result type.
2863         if (Member->getType() != ScalarTy) {
2864           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2865           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2866           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2867         }
2868 
2869         if (Group->isReverse())
2870           StridedVec = reverseVector(StridedVec);
2871 
2872         State.set(VPDefs[J], StridedVec, Part);
2873       }
2874       ++J;
2875     }
2876     return;
2877   }
2878 
2879   // The sub vector type for current instruction.
2880   auto *SubVT = VectorType::get(ScalarTy, VF);
2881 
2882   // Vectorize the interleaved store group.
2883   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2884   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2885          "masked interleaved groups are not allowed.");
2886   assert((!MaskForGaps || !VF.isScalable()) &&
2887          "masking gaps for scalable vectors is not yet supported.");
2888   for (unsigned Part = 0; Part < UF; Part++) {
2889     // Collect the stored vector from each member.
2890     SmallVector<Value *, 4> StoredVecs;
2891     for (unsigned i = 0; i < InterleaveFactor; i++) {
2892       assert((Group->getMember(i) || MaskForGaps) &&
2893              "Fail to get a member from an interleaved store group");
2894       Instruction *Member = Group->getMember(i);
2895 
2896       // Skip the gaps in the group.
2897       if (!Member) {
2898         Value *Undef = PoisonValue::get(SubVT);
2899         StoredVecs.push_back(Undef);
2900         continue;
2901       }
2902 
2903       Value *StoredVec = State.get(StoredValues[i], Part);
2904 
2905       if (Group->isReverse())
2906         StoredVec = reverseVector(StoredVec);
2907 
2908       // If this member has different type, cast it to a unified type.
2909 
2910       if (StoredVec->getType() != SubVT)
2911         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2912 
2913       StoredVecs.push_back(StoredVec);
2914     }
2915 
2916     // Concatenate all vectors into a wide vector.
2917     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2918 
2919     // Interleave the elements in the wide vector.
2920     Value *IVec = Builder.CreateShuffleVector(
2921         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2922         "interleaved.vec");
2923 
2924     Instruction *NewStoreInstr;
2925     if (BlockInMask || MaskForGaps) {
2926       Value *GroupMask = MaskForGaps;
2927       if (BlockInMask) {
2928         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2929         Value *ShuffledMask = Builder.CreateShuffleVector(
2930             BlockInMaskPart,
2931             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2932             "interleaved.mask");
2933         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2934                                                       ShuffledMask, MaskForGaps)
2935                                 : ShuffledMask;
2936       }
2937       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2938                                                 Group->getAlign(), GroupMask);
2939     } else
2940       NewStoreInstr =
2941           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2942 
2943     Group->addMetadata(NewStoreInstr);
2944   }
2945 }
2946 
2947 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2948                                                VPReplicateRecipe *RepRecipe,
2949                                                const VPIteration &Instance,
2950                                                bool IfPredicateInstr,
2951                                                VPTransformState &State) {
2952   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2953 
2954   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2955   // the first lane and part.
2956   if (isa<NoAliasScopeDeclInst>(Instr))
2957     if (!Instance.isFirstIteration())
2958       return;
2959 
2960   setDebugLocFromInst(Instr);
2961 
2962   // Does this instruction return a value ?
2963   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2964 
2965   Instruction *Cloned = Instr->clone();
2966   if (!IsVoidRetTy)
2967     Cloned->setName(Instr->getName() + ".cloned");
2968 
2969   // If the scalarized instruction contributes to the address computation of a
2970   // widen masked load/store which was in a basic block that needed predication
2971   // and is not predicated after vectorization, we can't propagate
2972   // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2973   // instruction could feed a poison value to the base address of the widen
2974   // load/store.
2975   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2976     Cloned->dropPoisonGeneratingFlags();
2977 
2978   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
2979                                Builder.GetInsertPoint());
2980   // Replace the operands of the cloned instructions with their scalar
2981   // equivalents in the new loop.
2982   for (auto &I : enumerate(RepRecipe->operands())) {
2983     auto InputInstance = Instance;
2984     VPValue *Operand = I.value();
2985     if (State.Plan->isUniformAfterVectorization(Operand))
2986       InputInstance.Lane = VPLane::getFirstLane();
2987     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2988   }
2989   addNewMetadata(Cloned, Instr);
2990 
2991   // Place the cloned scalar in the new loop.
2992   Builder.Insert(Cloned);
2993 
2994   State.set(RepRecipe, Cloned, Instance);
2995 
2996   // If we just cloned a new assumption, add it the assumption cache.
2997   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2998     AC->registerAssumption(II);
2999 
3000   // End if-block.
3001   if (IfPredicateInstr)
3002     PredicatedInstructions.push_back(Cloned);
3003 }
3004 
3005 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3006                                                       Value *End, Value *Step,
3007                                                       Instruction *DL) {
3008   BasicBlock *Header = L->getHeader();
3009   BasicBlock *Latch = L->getLoopLatch();
3010   // As we're just creating this loop, it's possible no latch exists
3011   // yet. If so, use the header as this will be a single block loop.
3012   if (!Latch)
3013     Latch = Header;
3014 
3015   IRBuilder<> B(&*Header->getFirstInsertionPt());
3016   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3017   setDebugLocFromInst(OldInst, &B);
3018   auto *Induction = B.CreatePHI(Start->getType(), 2, "index");
3019 
3020   B.SetInsertPoint(Latch->getTerminator());
3021   setDebugLocFromInst(OldInst, &B);
3022 
3023   // Create i+1 and fill the PHINode.
3024   //
3025   // If the tail is not folded, we know that End - Start >= Step (either
3026   // statically or through the minimum iteration checks). We also know that both
3027   // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
3028   // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
3029   // overflows and we can mark the induction increment as NUW.
3030   Value *Next = B.CreateAdd(Induction, Step, "index.next",
3031                             /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
3032   Induction->addIncoming(Start, L->getLoopPreheader());
3033   Induction->addIncoming(Next, Latch);
3034   // Create the compare.
3035   Value *ICmp = B.CreateICmpEQ(Next, End);
3036   B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3037 
3038   // Now we have two terminators. Remove the old one from the block.
3039   Latch->getTerminator()->eraseFromParent();
3040 
3041   return Induction;
3042 }
3043 
3044 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3045   if (TripCount)
3046     return TripCount;
3047 
3048   assert(L && "Create Trip Count for null loop.");
3049   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3050   // Find the loop boundaries.
3051   ScalarEvolution *SE = PSE.getSE();
3052   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3053   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3054          "Invalid loop count");
3055 
3056   Type *IdxTy = Legal->getWidestInductionType();
3057   assert(IdxTy && "No type for induction");
3058 
3059   // The exit count might have the type of i64 while the phi is i32. This can
3060   // happen if we have an induction variable that is sign extended before the
3061   // compare. The only way that we get a backedge taken count is that the
3062   // induction variable was signed and as such will not overflow. In such a case
3063   // truncation is legal.
3064   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3065       IdxTy->getPrimitiveSizeInBits())
3066     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3067   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3068 
3069   // Get the total trip count from the count by adding 1.
3070   const SCEV *ExitCount = SE->getAddExpr(
3071       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3072 
3073   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3074 
3075   // Expand the trip count and place the new instructions in the preheader.
3076   // Notice that the pre-header does not change, only the loop body.
3077   SCEVExpander Exp(*SE, DL, "induction");
3078 
3079   // Count holds the overall loop count (N).
3080   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3081                                 L->getLoopPreheader()->getTerminator());
3082 
3083   if (TripCount->getType()->isPointerTy())
3084     TripCount =
3085         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3086                                     L->getLoopPreheader()->getTerminator());
3087 
3088   return TripCount;
3089 }
3090 
3091 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3092   if (VectorTripCount)
3093     return VectorTripCount;
3094 
3095   Value *TC = getOrCreateTripCount(L);
3096   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3097 
3098   Type *Ty = TC->getType();
3099   // This is where we can make the step a runtime constant.
3100   Value *Step = createStepForVF(Builder, Ty, VF, UF);
3101 
3102   // If the tail is to be folded by masking, round the number of iterations N
3103   // up to a multiple of Step instead of rounding down. This is done by first
3104   // adding Step-1 and then rounding down. Note that it's ok if this addition
3105   // overflows: the vector induction variable will eventually wrap to zero given
3106   // that it starts at zero and its Step is a power of two; the loop will then
3107   // exit, with the last early-exit vector comparison also producing all-true.
3108   if (Cost->foldTailByMasking()) {
3109     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3110            "VF*UF must be a power of 2 when folding tail by masking");
3111     assert(!VF.isScalable() &&
3112            "Tail folding not yet supported for scalable vectors");
3113     TC = Builder.CreateAdd(
3114         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3115   }
3116 
3117   // Now we need to generate the expression for the part of the loop that the
3118   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3119   // iterations are not required for correctness, or N - Step, otherwise. Step
3120   // is equal to the vectorization factor (number of SIMD elements) times the
3121   // unroll factor (number of SIMD instructions).
3122   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3123 
3124   // There are cases where we *must* run at least one iteration in the remainder
3125   // loop.  See the cost model for when this can happen.  If the step evenly
3126   // divides the trip count, we set the remainder to be equal to the step. If
3127   // the step does not evenly divide the trip count, no adjustment is necessary
3128   // since there will already be scalar iterations. Note that the minimum
3129   // iterations check ensures that N >= Step.
3130   if (Cost->requiresScalarEpilogue(VF)) {
3131     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3132     R = Builder.CreateSelect(IsZero, Step, R);
3133   }
3134 
3135   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3136 
3137   return VectorTripCount;
3138 }
3139 
3140 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3141                                                    const DataLayout &DL) {
3142   // Verify that V is a vector type with same number of elements as DstVTy.
3143   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3144   unsigned VF = DstFVTy->getNumElements();
3145   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3146   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3147   Type *SrcElemTy = SrcVecTy->getElementType();
3148   Type *DstElemTy = DstFVTy->getElementType();
3149   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3150          "Vector elements must have same size");
3151 
3152   // Do a direct cast if element types are castable.
3153   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3154     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3155   }
3156   // V cannot be directly casted to desired vector type.
3157   // May happen when V is a floating point vector but DstVTy is a vector of
3158   // pointers or vice-versa. Handle this using a two-step bitcast using an
3159   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3160   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3161          "Only one type should be a pointer type");
3162   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3163          "Only one type should be a floating point type");
3164   Type *IntTy =
3165       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3166   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3167   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3168   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3169 }
3170 
3171 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3172                                                          BasicBlock *Bypass) {
3173   Value *Count = getOrCreateTripCount(L);
3174   // Reuse existing vector loop preheader for TC checks.
3175   // Note that new preheader block is generated for vector loop.
3176   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3177   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3178 
3179   // Generate code to check if the loop's trip count is less than VF * UF, or
3180   // equal to it in case a scalar epilogue is required; this implies that the
3181   // vector trip count is zero. This check also covers the case where adding one
3182   // to the backedge-taken count overflowed leading to an incorrect trip count
3183   // of zero. In this case we will also jump to the scalar loop.
3184   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
3185                                             : ICmpInst::ICMP_ULT;
3186 
3187   // If tail is to be folded, vector loop takes care of all iterations.
3188   Value *CheckMinIters = Builder.getFalse();
3189   if (!Cost->foldTailByMasking()) {
3190     Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
3191     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3192   }
3193   // Create new preheader for vector loop.
3194   LoopVectorPreHeader =
3195       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3196                  "vector.ph");
3197 
3198   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3199                                DT->getNode(Bypass)->getIDom()) &&
3200          "TC check is expected to dominate Bypass");
3201 
3202   // Update dominator for Bypass & LoopExit (if needed).
3203   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3204   if (!Cost->requiresScalarEpilogue(VF))
3205     // If there is an epilogue which must run, there's no edge from the
3206     // middle block to exit blocks  and thus no need to update the immediate
3207     // dominator of the exit blocks.
3208     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3209 
3210   ReplaceInstWithInst(
3211       TCCheckBlock->getTerminator(),
3212       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3213   LoopBypassBlocks.push_back(TCCheckBlock);
3214 }
3215 
3216 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3217 
3218   BasicBlock *const SCEVCheckBlock =
3219       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3220   if (!SCEVCheckBlock)
3221     return nullptr;
3222 
3223   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3224            (OptForSizeBasedOnProfile &&
3225             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3226          "Cannot SCEV check stride or overflow when optimizing for size");
3227 
3228 
3229   // Update dominator only if this is first RT check.
3230   if (LoopBypassBlocks.empty()) {
3231     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3232     if (!Cost->requiresScalarEpilogue(VF))
3233       // If there is an epilogue which must run, there's no edge from the
3234       // middle block to exit blocks  and thus no need to update the immediate
3235       // dominator of the exit blocks.
3236       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3237   }
3238 
3239   LoopBypassBlocks.push_back(SCEVCheckBlock);
3240   AddedSafetyChecks = true;
3241   return SCEVCheckBlock;
3242 }
3243 
3244 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3245                                                       BasicBlock *Bypass) {
3246   // VPlan-native path does not do any analysis for runtime checks currently.
3247   if (EnableVPlanNativePath)
3248     return nullptr;
3249 
3250   BasicBlock *const MemCheckBlock =
3251       RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3252 
3253   // Check if we generated code that checks in runtime if arrays overlap. We put
3254   // the checks into a separate block to make the more common case of few
3255   // elements faster.
3256   if (!MemCheckBlock)
3257     return nullptr;
3258 
3259   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3260     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3261            "Cannot emit memory checks when optimizing for size, unless forced "
3262            "to vectorize.");
3263     ORE->emit([&]() {
3264       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3265                                         L->getStartLoc(), L->getHeader())
3266              << "Code-size may be reduced by not forcing "
3267                 "vectorization, or by source-code modifications "
3268                 "eliminating the need for runtime checks "
3269                 "(e.g., adding 'restrict').";
3270     });
3271   }
3272 
3273   LoopBypassBlocks.push_back(MemCheckBlock);
3274 
3275   AddedSafetyChecks = true;
3276 
3277   // We currently don't use LoopVersioning for the actual loop cloning but we
3278   // still use it to add the noalias metadata.
3279   LVer = std::make_unique<LoopVersioning>(
3280       *Legal->getLAI(),
3281       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3282       DT, PSE.getSE());
3283   LVer->prepareNoAliasMetadata();
3284   return MemCheckBlock;
3285 }
3286 
3287 Value *InnerLoopVectorizer::emitTransformedIndex(
3288     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3289     const InductionDescriptor &ID, BasicBlock *VectorHeader) const {
3290 
3291   SCEVExpander Exp(*SE, DL, "induction");
3292   auto Step = ID.getStep();
3293   auto StartValue = ID.getStartValue();
3294   assert(Index->getType()->getScalarType() == Step->getType() &&
3295          "Index scalar type does not match StepValue type");
3296 
3297   // Note: the IR at this point is broken. We cannot use SE to create any new
3298   // SCEV and then expand it, hoping that SCEV's simplification will give us
3299   // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
3300   // lead to various SCEV crashes. So all we can do is to use builder and rely
3301   // on InstCombine for future simplifications. Here we handle some trivial
3302   // cases only.
3303   auto CreateAdd = [&B](Value *X, Value *Y) {
3304     assert(X->getType() == Y->getType() && "Types don't match!");
3305     if (auto *CX = dyn_cast<ConstantInt>(X))
3306       if (CX->isZero())
3307         return Y;
3308     if (auto *CY = dyn_cast<ConstantInt>(Y))
3309       if (CY->isZero())
3310         return X;
3311     return B.CreateAdd(X, Y);
3312   };
3313 
3314   // We allow X to be a vector type, in which case Y will potentially be
3315   // splatted into a vector with the same element count.
3316   auto CreateMul = [&B](Value *X, Value *Y) {
3317     assert(X->getType()->getScalarType() == Y->getType() &&
3318            "Types don't match!");
3319     if (auto *CX = dyn_cast<ConstantInt>(X))
3320       if (CX->isOne())
3321         return Y;
3322     if (auto *CY = dyn_cast<ConstantInt>(Y))
3323       if (CY->isOne())
3324         return X;
3325     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3326     if (XVTy && !isa<VectorType>(Y->getType()))
3327       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3328     return B.CreateMul(X, Y);
3329   };
3330 
3331   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3332   // loop, choose the end of the vector loop header (=VectorHeader), because
3333   // the DomTree is not kept up-to-date for additional blocks generated in the
3334   // vector loop. By using the header as insertion point, we guarantee that the
3335   // expanded instructions dominate all their uses.
3336   auto GetInsertPoint = [this, &B, VectorHeader]() {
3337     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3338     if (InsertBB != LoopVectorBody &&
3339         LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB))
3340       return VectorHeader->getTerminator();
3341     return &*B.GetInsertPoint();
3342   };
3343 
3344   switch (ID.getKind()) {
3345   case InductionDescriptor::IK_IntInduction: {
3346     assert(!isa<VectorType>(Index->getType()) &&
3347            "Vector indices not supported for integer inductions yet");
3348     assert(Index->getType() == StartValue->getType() &&
3349            "Index type does not match StartValue type");
3350     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3351       return B.CreateSub(StartValue, Index);
3352     auto *Offset = CreateMul(
3353         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3354     return CreateAdd(StartValue, Offset);
3355   }
3356   case InductionDescriptor::IK_PtrInduction: {
3357     assert(isa<SCEVConstant>(Step) &&
3358            "Expected constant step for pointer induction");
3359     return B.CreateGEP(
3360         ID.getElementType(), StartValue,
3361         CreateMul(Index,
3362                   Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3363                                     GetInsertPoint())));
3364   }
3365   case InductionDescriptor::IK_FpInduction: {
3366     assert(!isa<VectorType>(Index->getType()) &&
3367            "Vector indices not supported for FP inductions yet");
3368     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3369     auto InductionBinOp = ID.getInductionBinOp();
3370     assert(InductionBinOp &&
3371            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3372             InductionBinOp->getOpcode() == Instruction::FSub) &&
3373            "Original bin op should be defined for FP induction");
3374 
3375     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3376     Value *MulExp = B.CreateFMul(StepValue, Index);
3377     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3378                          "induction");
3379   }
3380   case InductionDescriptor::IK_NoInduction:
3381     return nullptr;
3382   }
3383   llvm_unreachable("invalid enum");
3384 }
3385 
3386 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3387   LoopScalarBody = OrigLoop->getHeader();
3388   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3389   assert(LoopVectorPreHeader && "Invalid loop structure");
3390   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3391   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3392          "multiple exit loop without required epilogue?");
3393 
3394   LoopMiddleBlock =
3395       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3396                  LI, nullptr, Twine(Prefix) + "middle.block");
3397   LoopScalarPreHeader =
3398       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3399                  nullptr, Twine(Prefix) + "scalar.ph");
3400 
3401   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3402 
3403   // Set up the middle block terminator.  Two cases:
3404   // 1) If we know that we must execute the scalar epilogue, emit an
3405   //    unconditional branch.
3406   // 2) Otherwise, we must have a single unique exit block (due to how we
3407   //    implement the multiple exit case).  In this case, set up a conditonal
3408   //    branch from the middle block to the loop scalar preheader, and the
3409   //    exit block.  completeLoopSkeleton will update the condition to use an
3410   //    iteration check, if required to decide whether to execute the remainder.
3411   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3412     BranchInst::Create(LoopScalarPreHeader) :
3413     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3414                        Builder.getTrue());
3415   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3416   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3417 
3418   // We intentionally don't let SplitBlock to update LoopInfo since
3419   // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
3420   // LoopVectorBody is explicitly added to the correct place few lines later.
3421   LoopVectorBody =
3422       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3423                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3424 
3425   // Update dominator for loop exit.
3426   if (!Cost->requiresScalarEpilogue(VF))
3427     // If there is an epilogue which must run, there's no edge from the
3428     // middle block to exit blocks  and thus no need to update the immediate
3429     // dominator of the exit blocks.
3430     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3431 
3432   // Create and register the new vector loop.
3433   Loop *Lp = LI->AllocateLoop();
3434   Loop *ParentLoop = OrigLoop->getParentLoop();
3435 
3436   // Insert the new loop into the loop nest and register the new basic blocks
3437   // before calling any utilities such as SCEV that require valid LoopInfo.
3438   if (ParentLoop) {
3439     ParentLoop->addChildLoop(Lp);
3440   } else {
3441     LI->addTopLevelLoop(Lp);
3442   }
3443   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3444   return Lp;
3445 }
3446 
3447 void InnerLoopVectorizer::createInductionResumeValues(
3448     Loop *L, Value *VectorTripCount,
3449     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3450   assert(VectorTripCount && L && "Expected valid arguments");
3451   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3452           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3453          "Inconsistent information about additional bypass.");
3454   // We are going to resume the execution of the scalar loop.
3455   // Go over all of the induction variables that we found and fix the
3456   // PHIs that are left in the scalar version of the loop.
3457   // The starting values of PHI nodes depend on the counter of the last
3458   // iteration in the vectorized loop.
3459   // If we come from a bypass edge then we need to start from the original
3460   // start value.
3461   for (auto &InductionEntry : Legal->getInductionVars()) {
3462     PHINode *OrigPhi = InductionEntry.first;
3463     InductionDescriptor II = InductionEntry.second;
3464 
3465     // Create phi nodes to merge from the  backedge-taken check block.
3466     PHINode *BCResumeVal =
3467         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3468                         LoopScalarPreHeader->getTerminator());
3469     // Copy original phi DL over to the new one.
3470     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3471     Value *&EndValue = IVEndValues[OrigPhi];
3472     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3473     if (OrigPhi == OldInduction) {
3474       // We know what the end value is.
3475       EndValue = VectorTripCount;
3476     } else {
3477       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3478 
3479       // Fast-math-flags propagate from the original induction instruction.
3480       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3481         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3482 
3483       Type *StepType = II.getStep()->getType();
3484       Instruction::CastOps CastOp =
3485           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3486       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3487       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3488       EndValue =
3489           emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
3490       EndValue->setName("ind.end");
3491 
3492       // Compute the end value for the additional bypass (if applicable).
3493       if (AdditionalBypass.first) {
3494         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3495         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3496                                          StepType, true);
3497         CRD =
3498             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3499         EndValueFromAdditionalBypass =
3500             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
3501         EndValueFromAdditionalBypass->setName("ind.end");
3502       }
3503     }
3504     // The new PHI merges the original incoming value, in case of a bypass,
3505     // or the value at the end of the vectorized loop.
3506     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3507 
3508     // Fix the scalar body counter (PHI node).
3509     // The old induction's phi node in the scalar body needs the truncated
3510     // value.
3511     for (BasicBlock *BB : LoopBypassBlocks)
3512       BCResumeVal->addIncoming(II.getStartValue(), BB);
3513 
3514     if (AdditionalBypass.first)
3515       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3516                                             EndValueFromAdditionalBypass);
3517 
3518     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3519   }
3520 }
3521 
3522 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3523                                                       MDNode *OrigLoopID) {
3524   assert(L && "Expected valid loop.");
3525 
3526   // The trip counts should be cached by now.
3527   Value *Count = getOrCreateTripCount(L);
3528   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3529 
3530   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3531 
3532   // Add a check in the middle block to see if we have completed
3533   // all of the iterations in the first vector loop.  Three cases:
3534   // 1) If we require a scalar epilogue, there is no conditional branch as
3535   //    we unconditionally branch to the scalar preheader.  Do nothing.
3536   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3537   //    Thus if tail is to be folded, we know we don't need to run the
3538   //    remainder and we can use the previous value for the condition (true).
3539   // 3) Otherwise, construct a runtime check.
3540   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3541     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3542                                         Count, VectorTripCount, "cmp.n",
3543                                         LoopMiddleBlock->getTerminator());
3544 
3545     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3546     // of the corresponding compare because they may have ended up with
3547     // different line numbers and we want to avoid awkward line stepping while
3548     // debugging. Eg. if the compare has got a line number inside the loop.
3549     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3550     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3551   }
3552 
3553   // Get ready to start creating new instructions into the vectorized body.
3554   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3555          "Inconsistent vector loop preheader");
3556   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3557 
3558   Optional<MDNode *> VectorizedLoopID =
3559       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3560                                       LLVMLoopVectorizeFollowupVectorized});
3561   if (VectorizedLoopID.hasValue()) {
3562     L->setLoopID(VectorizedLoopID.getValue());
3563 
3564     // Do not setAlreadyVectorized if loop attributes have been defined
3565     // explicitly.
3566     return LoopVectorPreHeader;
3567   }
3568 
3569   // Keep all loop hints from the original loop on the vector loop (we'll
3570   // replace the vectorizer-specific hints below).
3571   if (MDNode *LID = OrigLoop->getLoopID())
3572     L->setLoopID(LID);
3573 
3574   LoopVectorizeHints Hints(L, true, *ORE, TTI);
3575   Hints.setAlreadyVectorized();
3576 
3577 #ifdef EXPENSIVE_CHECKS
3578   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3579   LI->verify(*DT);
3580 #endif
3581 
3582   return LoopVectorPreHeader;
3583 }
3584 
3585 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3586   /*
3587    In this function we generate a new loop. The new loop will contain
3588    the vectorized instructions while the old loop will continue to run the
3589    scalar remainder.
3590 
3591        [ ] <-- loop iteration number check.
3592     /   |
3593    /    v
3594   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3595   |  /  |
3596   | /   v
3597   ||   [ ]     <-- vector pre header.
3598   |/    |
3599   |     v
3600   |    [  ] \
3601   |    [  ]_|   <-- vector loop.
3602   |     |
3603   |     v
3604   \   -[ ]   <--- middle-block.
3605    \/   |
3606    /\   v
3607    | ->[ ]     <--- new preheader.
3608    |    |
3609  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3610    |   [ ] \
3611    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3612     \   |
3613      \  v
3614       >[ ]     <-- exit block(s).
3615    ...
3616    */
3617 
3618   // Get the metadata of the original loop before it gets modified.
3619   MDNode *OrigLoopID = OrigLoop->getLoopID();
3620 
3621   // Workaround!  Compute the trip count of the original loop and cache it
3622   // before we start modifying the CFG.  This code has a systemic problem
3623   // wherein it tries to run analysis over partially constructed IR; this is
3624   // wrong, and not simply for SCEV.  The trip count of the original loop
3625   // simply happens to be prone to hitting this in practice.  In theory, we
3626   // can hit the same issue for any SCEV, or ValueTracking query done during
3627   // mutation.  See PR49900.
3628   getOrCreateTripCount(OrigLoop);
3629 
3630   // Create an empty vector loop, and prepare basic blocks for the runtime
3631   // checks.
3632   Loop *Lp = createVectorLoopSkeleton("");
3633 
3634   // Now, compare the new count to zero. If it is zero skip the vector loop and
3635   // jump to the scalar loop. This check also covers the case where the
3636   // backedge-taken count is uint##_max: adding one to it will overflow leading
3637   // to an incorrect trip count of zero. In this (rare) case we will also jump
3638   // to the scalar loop.
3639   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3640 
3641   // Generate the code to check any assumptions that we've made for SCEV
3642   // expressions.
3643   emitSCEVChecks(Lp, LoopScalarPreHeader);
3644 
3645   // Generate the code that checks in runtime if arrays overlap. We put the
3646   // checks into a separate block to make the more common case of few elements
3647   // faster.
3648   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3649 
3650   // Some loops have a single integer induction variable, while other loops
3651   // don't. One example is c++ iterators that often have multiple pointer
3652   // induction variables. In the code below we also support a case where we
3653   // don't have a single induction variable.
3654   //
3655   // We try to obtain an induction variable from the original loop as hard
3656   // as possible. However if we don't find one that:
3657   //   - is an integer
3658   //   - counts from zero, stepping by one
3659   //   - is the size of the widest induction variable type
3660   // then we create a new one.
3661   OldInduction = Legal->getPrimaryInduction();
3662   Type *IdxTy = Legal->getWidestInductionType();
3663   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3664   // The loop step is equal to the vectorization factor (num of SIMD elements)
3665   // times the unroll factor (num of SIMD instructions).
3666   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3667   Value *Step = createStepForVF(Builder, IdxTy, VF, UF);
3668   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3669   Induction =
3670       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3671                               getDebugLocFromInstOrOperands(OldInduction));
3672 
3673   // Emit phis for the new starting index of the scalar loop.
3674   createInductionResumeValues(Lp, CountRoundDown);
3675 
3676   return completeLoopSkeleton(Lp, OrigLoopID);
3677 }
3678 
3679 // Fix up external users of the induction variable. At this point, we are
3680 // in LCSSA form, with all external PHIs that use the IV having one input value,
3681 // coming from the remainder loop. We need those PHIs to also have a correct
3682 // value for the IV when arriving directly from the middle block.
3683 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3684                                        const InductionDescriptor &II,
3685                                        Value *CountRoundDown, Value *EndValue,
3686                                        BasicBlock *MiddleBlock) {
3687   // There are two kinds of external IV usages - those that use the value
3688   // computed in the last iteration (the PHI) and those that use the penultimate
3689   // value (the value that feeds into the phi from the loop latch).
3690   // We allow both, but they, obviously, have different values.
3691 
3692   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3693 
3694   DenseMap<Value *, Value *> MissingVals;
3695 
3696   // An external user of the last iteration's value should see the value that
3697   // the remainder loop uses to initialize its own IV.
3698   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3699   for (User *U : PostInc->users()) {
3700     Instruction *UI = cast<Instruction>(U);
3701     if (!OrigLoop->contains(UI)) {
3702       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3703       MissingVals[UI] = EndValue;
3704     }
3705   }
3706 
3707   // An external user of the penultimate value need to see EndValue - Step.
3708   // The simplest way to get this is to recompute it from the constituent SCEVs,
3709   // that is Start + (Step * (CRD - 1)).
3710   for (User *U : OrigPhi->users()) {
3711     auto *UI = cast<Instruction>(U);
3712     if (!OrigLoop->contains(UI)) {
3713       const DataLayout &DL =
3714           OrigLoop->getHeader()->getModule()->getDataLayout();
3715       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3716 
3717       IRBuilder<> B(MiddleBlock->getTerminator());
3718 
3719       // Fast-math-flags propagate from the original induction instruction.
3720       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3721         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3722 
3723       Value *CountMinusOne = B.CreateSub(
3724           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3725       Value *CMO =
3726           !II.getStep()->getType()->isIntegerTy()
3727               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3728                              II.getStep()->getType())
3729               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3730       CMO->setName("cast.cmo");
3731       Value *Escape =
3732           emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody);
3733       Escape->setName("ind.escape");
3734       MissingVals[UI] = Escape;
3735     }
3736   }
3737 
3738   for (auto &I : MissingVals) {
3739     PHINode *PHI = cast<PHINode>(I.first);
3740     // One corner case we have to handle is two IVs "chasing" each-other,
3741     // that is %IV2 = phi [...], [ %IV1, %latch ]
3742     // In this case, if IV1 has an external use, we need to avoid adding both
3743     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3744     // don't already have an incoming value for the middle block.
3745     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3746       PHI->addIncoming(I.second, MiddleBlock);
3747   }
3748 }
3749 
3750 namespace {
3751 
3752 struct CSEDenseMapInfo {
3753   static bool canHandle(const Instruction *I) {
3754     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3755            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3756   }
3757 
3758   static inline Instruction *getEmptyKey() {
3759     return DenseMapInfo<Instruction *>::getEmptyKey();
3760   }
3761 
3762   static inline Instruction *getTombstoneKey() {
3763     return DenseMapInfo<Instruction *>::getTombstoneKey();
3764   }
3765 
3766   static unsigned getHashValue(const Instruction *I) {
3767     assert(canHandle(I) && "Unknown instruction!");
3768     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3769                                                            I->value_op_end()));
3770   }
3771 
3772   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3773     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3774         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3775       return LHS == RHS;
3776     return LHS->isIdenticalTo(RHS);
3777   }
3778 };
3779 
3780 } // end anonymous namespace
3781 
3782 ///Perform cse of induction variable instructions.
3783 static void cse(BasicBlock *BB) {
3784   // Perform simple cse.
3785   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3786   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3787     if (!CSEDenseMapInfo::canHandle(&In))
3788       continue;
3789 
3790     // Check if we can replace this instruction with any of the
3791     // visited instructions.
3792     if (Instruction *V = CSEMap.lookup(&In)) {
3793       In.replaceAllUsesWith(V);
3794       In.eraseFromParent();
3795       continue;
3796     }
3797 
3798     CSEMap[&In] = &In;
3799   }
3800 }
3801 
3802 InstructionCost
3803 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3804                                               bool &NeedToScalarize) const {
3805   Function *F = CI->getCalledFunction();
3806   Type *ScalarRetTy = CI->getType();
3807   SmallVector<Type *, 4> Tys, ScalarTys;
3808   for (auto &ArgOp : CI->args())
3809     ScalarTys.push_back(ArgOp->getType());
3810 
3811   // Estimate cost of scalarized vector call. The source operands are assumed
3812   // to be vectors, so we need to extract individual elements from there,
3813   // execute VF scalar calls, and then gather the result into the vector return
3814   // value.
3815   InstructionCost ScalarCallCost =
3816       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3817   if (VF.isScalar())
3818     return ScalarCallCost;
3819 
3820   // Compute corresponding vector type for return value and arguments.
3821   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3822   for (Type *ScalarTy : ScalarTys)
3823     Tys.push_back(ToVectorTy(ScalarTy, VF));
3824 
3825   // Compute costs of unpacking argument values for the scalar calls and
3826   // packing the return values to a vector.
3827   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3828 
3829   InstructionCost Cost =
3830       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3831 
3832   // If we can't emit a vector call for this function, then the currently found
3833   // cost is the cost we need to return.
3834   NeedToScalarize = true;
3835   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3836   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3837 
3838   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3839     return Cost;
3840 
3841   // If the corresponding vector cost is cheaper, return its cost.
3842   InstructionCost VectorCallCost =
3843       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3844   if (VectorCallCost < Cost) {
3845     NeedToScalarize = false;
3846     Cost = VectorCallCost;
3847   }
3848   return Cost;
3849 }
3850 
3851 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3852   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3853     return Elt;
3854   return VectorType::get(Elt, VF);
3855 }
3856 
3857 InstructionCost
3858 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3859                                                    ElementCount VF) const {
3860   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3861   assert(ID && "Expected intrinsic call!");
3862   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3863   FastMathFlags FMF;
3864   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3865     FMF = FPMO->getFastMathFlags();
3866 
3867   SmallVector<const Value *> Arguments(CI->args());
3868   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3869   SmallVector<Type *> ParamTys;
3870   std::transform(FTy->param_begin(), FTy->param_end(),
3871                  std::back_inserter(ParamTys),
3872                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3873 
3874   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3875                                     dyn_cast<IntrinsicInst>(CI));
3876   return TTI.getIntrinsicInstrCost(CostAttrs,
3877                                    TargetTransformInfo::TCK_RecipThroughput);
3878 }
3879 
3880 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3881   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3882   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3883   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3884 }
3885 
3886 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3887   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3888   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3889   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3890 }
3891 
3892 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3893   // For every instruction `I` in MinBWs, truncate the operands, create a
3894   // truncated version of `I` and reextend its result. InstCombine runs
3895   // later and will remove any ext/trunc pairs.
3896   SmallPtrSet<Value *, 4> Erased;
3897   for (const auto &KV : Cost->getMinimalBitwidths()) {
3898     // If the value wasn't vectorized, we must maintain the original scalar
3899     // type. The absence of the value from State indicates that it
3900     // wasn't vectorized.
3901     // FIXME: Should not rely on getVPValue at this point.
3902     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3903     if (!State.hasAnyVectorValue(Def))
3904       continue;
3905     for (unsigned Part = 0; Part < UF; ++Part) {
3906       Value *I = State.get(Def, Part);
3907       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3908         continue;
3909       Type *OriginalTy = I->getType();
3910       Type *ScalarTruncatedTy =
3911           IntegerType::get(OriginalTy->getContext(), KV.second);
3912       auto *TruncatedTy = VectorType::get(
3913           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3914       if (TruncatedTy == OriginalTy)
3915         continue;
3916 
3917       IRBuilder<> B(cast<Instruction>(I));
3918       auto ShrinkOperand = [&](Value *V) -> Value * {
3919         if (auto *ZI = dyn_cast<ZExtInst>(V))
3920           if (ZI->getSrcTy() == TruncatedTy)
3921             return ZI->getOperand(0);
3922         return B.CreateZExtOrTrunc(V, TruncatedTy);
3923       };
3924 
3925       // The actual instruction modification depends on the instruction type,
3926       // unfortunately.
3927       Value *NewI = nullptr;
3928       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3929         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3930                              ShrinkOperand(BO->getOperand(1)));
3931 
3932         // Any wrapping introduced by shrinking this operation shouldn't be
3933         // considered undefined behavior. So, we can't unconditionally copy
3934         // arithmetic wrapping flags to NewI.
3935         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3936       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3937         NewI =
3938             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3939                          ShrinkOperand(CI->getOperand(1)));
3940       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3941         NewI = B.CreateSelect(SI->getCondition(),
3942                               ShrinkOperand(SI->getTrueValue()),
3943                               ShrinkOperand(SI->getFalseValue()));
3944       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3945         switch (CI->getOpcode()) {
3946         default:
3947           llvm_unreachable("Unhandled cast!");
3948         case Instruction::Trunc:
3949           NewI = ShrinkOperand(CI->getOperand(0));
3950           break;
3951         case Instruction::SExt:
3952           NewI = B.CreateSExtOrTrunc(
3953               CI->getOperand(0),
3954               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3955           break;
3956         case Instruction::ZExt:
3957           NewI = B.CreateZExtOrTrunc(
3958               CI->getOperand(0),
3959               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3960           break;
3961         }
3962       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3963         auto Elements0 =
3964             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3965         auto *O0 = B.CreateZExtOrTrunc(
3966             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3967         auto Elements1 =
3968             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3969         auto *O1 = B.CreateZExtOrTrunc(
3970             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3971 
3972         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3973       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3974         // Don't do anything with the operands, just extend the result.
3975         continue;
3976       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3977         auto Elements =
3978             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3979         auto *O0 = B.CreateZExtOrTrunc(
3980             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3981         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3982         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3983       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3984         auto Elements =
3985             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3986         auto *O0 = B.CreateZExtOrTrunc(
3987             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3988         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3989       } else {
3990         // If we don't know what to do, be conservative and don't do anything.
3991         continue;
3992       }
3993 
3994       // Lastly, extend the result.
3995       NewI->takeName(cast<Instruction>(I));
3996       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3997       I->replaceAllUsesWith(Res);
3998       cast<Instruction>(I)->eraseFromParent();
3999       Erased.insert(I);
4000       State.reset(Def, Res, Part);
4001     }
4002   }
4003 
4004   // We'll have created a bunch of ZExts that are now parentless. Clean up.
4005   for (const auto &KV : Cost->getMinimalBitwidths()) {
4006     // If the value wasn't vectorized, we must maintain the original scalar
4007     // type. The absence of the value from State indicates that it
4008     // wasn't vectorized.
4009     // FIXME: Should not rely on getVPValue at this point.
4010     VPValue *Def = State.Plan->getVPValue(KV.first, true);
4011     if (!State.hasAnyVectorValue(Def))
4012       continue;
4013     for (unsigned Part = 0; Part < UF; ++Part) {
4014       Value *I = State.get(Def, Part);
4015       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
4016       if (Inst && Inst->use_empty()) {
4017         Value *NewI = Inst->getOperand(0);
4018         Inst->eraseFromParent();
4019         State.reset(Def, NewI, Part);
4020       }
4021     }
4022   }
4023 }
4024 
4025 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
4026   // Insert truncates and extends for any truncated instructions as hints to
4027   // InstCombine.
4028   if (VF.isVector())
4029     truncateToMinimalBitwidths(State);
4030 
4031   // Fix widened non-induction PHIs by setting up the PHI operands.
4032   if (OrigPHIsToFix.size()) {
4033     assert(EnableVPlanNativePath &&
4034            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
4035     fixNonInductionPHIs(State);
4036   }
4037 
4038   // At this point every instruction in the original loop is widened to a
4039   // vector form. Now we need to fix the recurrences in the loop. These PHI
4040   // nodes are currently empty because we did not want to introduce cycles.
4041   // This is the second stage of vectorizing recurrences.
4042   fixCrossIterationPHIs(State);
4043 
4044   // Forget the original basic block.
4045   PSE.getSE()->forgetLoop(OrigLoop);
4046 
4047   // If we inserted an edge from the middle block to the unique exit block,
4048   // update uses outside the loop (phis) to account for the newly inserted
4049   // edge.
4050   if (!Cost->requiresScalarEpilogue(VF)) {
4051     // Fix-up external users of the induction variables.
4052     for (auto &Entry : Legal->getInductionVars())
4053       fixupIVUsers(Entry.first, Entry.second,
4054                    getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4055                    IVEndValues[Entry.first], LoopMiddleBlock);
4056 
4057     fixLCSSAPHIs(State);
4058   }
4059 
4060   for (Instruction *PI : PredicatedInstructions)
4061     sinkScalarOperands(&*PI);
4062 
4063   // Remove redundant induction instructions.
4064   cse(LoopVectorBody);
4065 
4066   // Set/update profile weights for the vector and remainder loops as original
4067   // loop iterations are now distributed among them. Note that original loop
4068   // represented by LoopScalarBody becomes remainder loop after vectorization.
4069   //
4070   // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
4071   // end up getting slightly roughened result but that should be OK since
4072   // profile is not inherently precise anyway. Note also possible bypass of
4073   // vector code caused by legality checks is ignored, assigning all the weight
4074   // to the vector loop, optimistically.
4075   //
4076   // For scalable vectorization we can't know at compile time how many iterations
4077   // of the loop are handled in one vector iteration, so instead assume a pessimistic
4078   // vscale of '1'.
4079   setProfileInfoAfterUnrolling(
4080       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4081       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4082 }
4083 
4084 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4085   // In order to support recurrences we need to be able to vectorize Phi nodes.
4086   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4087   // stage #2: We now need to fix the recurrences by adding incoming edges to
4088   // the currently empty PHI nodes. At this point every instruction in the
4089   // original loop is widened to a vector form so we can use them to construct
4090   // the incoming edges.
4091   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4092   for (VPRecipeBase &R : Header->phis()) {
4093     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4094       fixReduction(ReductionPhi, State);
4095     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4096       fixFirstOrderRecurrence(FOR, State);
4097   }
4098 }
4099 
4100 void InnerLoopVectorizer::fixFirstOrderRecurrence(
4101     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
4102   // This is the second phase of vectorizing first-order recurrences. An
4103   // overview of the transformation is described below. Suppose we have the
4104   // following loop.
4105   //
4106   //   for (int i = 0; i < n; ++i)
4107   //     b[i] = a[i] - a[i - 1];
4108   //
4109   // There is a first-order recurrence on "a". For this loop, the shorthand
4110   // scalar IR looks like:
4111   //
4112   //   scalar.ph:
4113   //     s_init = a[-1]
4114   //     br scalar.body
4115   //
4116   //   scalar.body:
4117   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4118   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4119   //     s2 = a[i]
4120   //     b[i] = s2 - s1
4121   //     br cond, scalar.body, ...
4122   //
4123   // In this example, s1 is a recurrence because it's value depends on the
4124   // previous iteration. In the first phase of vectorization, we created a
4125   // vector phi v1 for s1. We now complete the vectorization and produce the
4126   // shorthand vector IR shown below (for VF = 4, UF = 1).
4127   //
4128   //   vector.ph:
4129   //     v_init = vector(..., ..., ..., a[-1])
4130   //     br vector.body
4131   //
4132   //   vector.body
4133   //     i = phi [0, vector.ph], [i+4, vector.body]
4134   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
4135   //     v2 = a[i, i+1, i+2, i+3];
4136   //     v3 = vector(v1(3), v2(0, 1, 2))
4137   //     b[i, i+1, i+2, i+3] = v2 - v3
4138   //     br cond, vector.body, middle.block
4139   //
4140   //   middle.block:
4141   //     x = v2(3)
4142   //     br scalar.ph
4143   //
4144   //   scalar.ph:
4145   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4146   //     br scalar.body
4147   //
4148   // After execution completes the vector loop, we extract the next value of
4149   // the recurrence (x) to use as the initial value in the scalar loop.
4150 
4151   // Extract the last vector element in the middle block. This will be the
4152   // initial value for the recurrence when jumping to the scalar loop.
4153   VPValue *PreviousDef = PhiR->getBackedgeValue();
4154   Value *Incoming = State.get(PreviousDef, UF - 1);
4155   auto *ExtractForScalar = Incoming;
4156   auto *IdxTy = Builder.getInt32Ty();
4157   if (VF.isVector()) {
4158     auto *One = ConstantInt::get(IdxTy, 1);
4159     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4160     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4161     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4162     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
4163                                                     "vector.recur.extract");
4164   }
4165   // Extract the second last element in the middle block if the
4166   // Phi is used outside the loop. We need to extract the phi itself
4167   // and not the last element (the phi update in the current iteration). This
4168   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4169   // when the scalar loop is not run at all.
4170   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4171   if (VF.isVector()) {
4172     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4173     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4174     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4175         Incoming, Idx, "vector.recur.extract.for.phi");
4176   } else if (UF > 1)
4177     // When loop is unrolled without vectorizing, initialize
4178     // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
4179     // of `Incoming`. This is analogous to the vectorized case above: extracting
4180     // the second last element when VF > 1.
4181     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4182 
4183   // Fix the initial value of the original recurrence in the scalar loop.
4184   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4185   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4186   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4187   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4188   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4189     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4190     Start->addIncoming(Incoming, BB);
4191   }
4192 
4193   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4194   Phi->setName("scalar.recur");
4195 
4196   // Finally, fix users of the recurrence outside the loop. The users will need
4197   // either the last value of the scalar recurrence or the last value of the
4198   // vector recurrence we extracted in the middle block. Since the loop is in
4199   // LCSSA form, we just need to find all the phi nodes for the original scalar
4200   // recurrence in the exit block, and then add an edge for the middle block.
4201   // Note that LCSSA does not imply single entry when the original scalar loop
4202   // had multiple exiting edges (as we always run the last iteration in the
4203   // scalar epilogue); in that case, there is no edge from middle to exit and
4204   // and thus no phis which needed updated.
4205   if (!Cost->requiresScalarEpilogue(VF))
4206     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4207       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4208         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4209 }
4210 
4211 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4212                                        VPTransformState &State) {
4213   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4214   // Get it's reduction variable descriptor.
4215   assert(Legal->isReductionVariable(OrigPhi) &&
4216          "Unable to find the reduction variable");
4217   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4218 
4219   RecurKind RK = RdxDesc.getRecurrenceKind();
4220   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4221   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4222   setDebugLocFromInst(ReductionStartValue);
4223 
4224   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4225   // This is the vector-clone of the value that leaves the loop.
4226   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4227 
4228   // Wrap flags are in general invalid after vectorization, clear them.
4229   clearReductionWrapFlags(RdxDesc, State);
4230 
4231   // Before each round, move the insertion point right between
4232   // the PHIs and the values we are going to write.
4233   // This allows us to write both PHINodes and the extractelement
4234   // instructions.
4235   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4236 
4237   setDebugLocFromInst(LoopExitInst);
4238 
4239   Type *PhiTy = OrigPhi->getType();
4240   // If tail is folded by masking, the vector value to leave the loop should be
4241   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4242   // instead of the former. For an inloop reduction the reduction will already
4243   // be predicated, and does not need to be handled here.
4244   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4245     for (unsigned Part = 0; Part < UF; ++Part) {
4246       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4247       Value *Sel = nullptr;
4248       for (User *U : VecLoopExitInst->users()) {
4249         if (isa<SelectInst>(U)) {
4250           assert(!Sel && "Reduction exit feeding two selects");
4251           Sel = U;
4252         } else
4253           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4254       }
4255       assert(Sel && "Reduction exit feeds no select");
4256       State.reset(LoopExitInstDef, Sel, Part);
4257 
4258       // If the target can create a predicated operator for the reduction at no
4259       // extra cost in the loop (for example a predicated vadd), it can be
4260       // cheaper for the select to remain in the loop than be sunk out of it,
4261       // and so use the select value for the phi instead of the old
4262       // LoopExitValue.
4263       if (PreferPredicatedReductionSelect ||
4264           TTI->preferPredicatedReductionSelect(
4265               RdxDesc.getOpcode(), PhiTy,
4266               TargetTransformInfo::ReductionFlags())) {
4267         auto *VecRdxPhi =
4268             cast<PHINode>(State.get(PhiR, Part));
4269         VecRdxPhi->setIncomingValueForBlock(
4270             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4271       }
4272     }
4273   }
4274 
4275   // If the vector reduction can be performed in a smaller type, we truncate
4276   // then extend the loop exit value to enable InstCombine to evaluate the
4277   // entire expression in the smaller type.
4278   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4279     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
4280     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4281     Builder.SetInsertPoint(
4282         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4283     VectorParts RdxParts(UF);
4284     for (unsigned Part = 0; Part < UF; ++Part) {
4285       RdxParts[Part] = State.get(LoopExitInstDef, Part);
4286       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4287       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4288                                         : Builder.CreateZExt(Trunc, VecTy);
4289       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
4290         if (U != Trunc) {
4291           U->replaceUsesOfWith(RdxParts[Part], Extnd);
4292           RdxParts[Part] = Extnd;
4293         }
4294     }
4295     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4296     for (unsigned Part = 0; Part < UF; ++Part) {
4297       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4298       State.reset(LoopExitInstDef, RdxParts[Part], Part);
4299     }
4300   }
4301 
4302   // Reduce all of the unrolled parts into a single vector.
4303   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4304   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4305 
4306   // The middle block terminator has already been assigned a DebugLoc here (the
4307   // OrigLoop's single latch terminator). We want the whole middle block to
4308   // appear to execute on this line because: (a) it is all compiler generated,
4309   // (b) these instructions are always executed after evaluating the latch
4310   // conditional branch, and (c) other passes may add new predecessors which
4311   // terminate on this line. This is the easiest way to ensure we don't
4312   // accidentally cause an extra step back into the loop while debugging.
4313   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
4314   if (PhiR->isOrdered())
4315     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4316   else {
4317     // Floating-point operations should have some FMF to enable the reduction.
4318     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4319     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4320     for (unsigned Part = 1; Part < UF; ++Part) {
4321       Value *RdxPart = State.get(LoopExitInstDef, Part);
4322       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4323         ReducedPartRdx = Builder.CreateBinOp(
4324             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4325       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
4326         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
4327                                            ReducedPartRdx, RdxPart);
4328       else
4329         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4330     }
4331   }
4332 
4333   // Create the reduction after the loop. Note that inloop reductions create the
4334   // target reduction in the loop using a Reduction recipe.
4335   if (VF.isVector() && !PhiR->isInLoop()) {
4336     ReducedPartRdx =
4337         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4338     // If the reduction can be performed in a smaller type, we need to extend
4339     // the reduction to the wider type before we branch to the original loop.
4340     if (PhiTy != RdxDesc.getRecurrenceType())
4341       ReducedPartRdx = RdxDesc.isSigned()
4342                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4343                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4344   }
4345 
4346   // Create a phi node that merges control-flow from the backedge-taken check
4347   // block and the middle block.
4348   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4349                                         LoopScalarPreHeader->getTerminator());
4350   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4351     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4352   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4353 
4354   // Now, we need to fix the users of the reduction variable
4355   // inside and outside of the scalar remainder loop.
4356 
4357   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4358   // in the exit blocks.  See comment on analogous loop in
4359   // fixFirstOrderRecurrence for a more complete explaination of the logic.
4360   if (!Cost->requiresScalarEpilogue(VF))
4361     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4362       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4363         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4364 
4365   // Fix the scalar loop reduction variable with the incoming reduction sum
4366   // from the vector body and from the backedge value.
4367   int IncomingEdgeBlockIdx =
4368       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4369   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4370   // Pick the other block.
4371   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4372   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4373   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4374 }
4375 
4376 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4377                                                   VPTransformState &State) {
4378   RecurKind RK = RdxDesc.getRecurrenceKind();
4379   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4380     return;
4381 
4382   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4383   assert(LoopExitInstr && "null loop exit instruction");
4384   SmallVector<Instruction *, 8> Worklist;
4385   SmallPtrSet<Instruction *, 8> Visited;
4386   Worklist.push_back(LoopExitInstr);
4387   Visited.insert(LoopExitInstr);
4388 
4389   while (!Worklist.empty()) {
4390     Instruction *Cur = Worklist.pop_back_val();
4391     if (isa<OverflowingBinaryOperator>(Cur))
4392       for (unsigned Part = 0; Part < UF; ++Part) {
4393         // FIXME: Should not rely on getVPValue at this point.
4394         Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4395         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4396       }
4397 
4398     for (User *U : Cur->users()) {
4399       Instruction *UI = cast<Instruction>(U);
4400       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4401           Visited.insert(UI).second)
4402         Worklist.push_back(UI);
4403     }
4404   }
4405 }
4406 
4407 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4408   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4409     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4410       // Some phis were already hand updated by the reduction and recurrence
4411       // code above, leave them alone.
4412       continue;
4413 
4414     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4415     // Non-instruction incoming values will have only one value.
4416 
4417     VPLane Lane = VPLane::getFirstLane();
4418     if (isa<Instruction>(IncomingValue) &&
4419         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4420                                            VF))
4421       Lane = VPLane::getLastLaneForVF(VF);
4422 
4423     // Can be a loop invariant incoming value or the last scalar value to be
4424     // extracted from the vectorized loop.
4425     // FIXME: Should not rely on getVPValue at this point.
4426     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4427     Value *lastIncomingValue =
4428         OrigLoop->isLoopInvariant(IncomingValue)
4429             ? IncomingValue
4430             : State.get(State.Plan->getVPValue(IncomingValue, true),
4431                         VPIteration(UF - 1, Lane));
4432     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4433   }
4434 }
4435 
4436 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4437   // The basic block and loop containing the predicated instruction.
4438   auto *PredBB = PredInst->getParent();
4439   auto *VectorLoop = LI->getLoopFor(PredBB);
4440 
4441   // Initialize a worklist with the operands of the predicated instruction.
4442   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4443 
4444   // Holds instructions that we need to analyze again. An instruction may be
4445   // reanalyzed if we don't yet know if we can sink it or not.
4446   SmallVector<Instruction *, 8> InstsToReanalyze;
4447 
4448   // Returns true if a given use occurs in the predicated block. Phi nodes use
4449   // their operands in their corresponding predecessor blocks.
4450   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4451     auto *I = cast<Instruction>(U.getUser());
4452     BasicBlock *BB = I->getParent();
4453     if (auto *Phi = dyn_cast<PHINode>(I))
4454       BB = Phi->getIncomingBlock(
4455           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4456     return BB == PredBB;
4457   };
4458 
4459   // Iteratively sink the scalarized operands of the predicated instruction
4460   // into the block we created for it. When an instruction is sunk, it's
4461   // operands are then added to the worklist. The algorithm ends after one pass
4462   // through the worklist doesn't sink a single instruction.
4463   bool Changed;
4464   do {
4465     // Add the instructions that need to be reanalyzed to the worklist, and
4466     // reset the changed indicator.
4467     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4468     InstsToReanalyze.clear();
4469     Changed = false;
4470 
4471     while (!Worklist.empty()) {
4472       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4473 
4474       // We can't sink an instruction if it is a phi node, is not in the loop,
4475       // or may have side effects.
4476       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4477           I->mayHaveSideEffects())
4478         continue;
4479 
4480       // If the instruction is already in PredBB, check if we can sink its
4481       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4482       // sinking the scalar instruction I, hence it appears in PredBB; but it
4483       // may have failed to sink I's operands (recursively), which we try
4484       // (again) here.
4485       if (I->getParent() == PredBB) {
4486         Worklist.insert(I->op_begin(), I->op_end());
4487         continue;
4488       }
4489 
4490       // It's legal to sink the instruction if all its uses occur in the
4491       // predicated block. Otherwise, there's nothing to do yet, and we may
4492       // need to reanalyze the instruction.
4493       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4494         InstsToReanalyze.push_back(I);
4495         continue;
4496       }
4497 
4498       // Move the instruction to the beginning of the predicated block, and add
4499       // it's operands to the worklist.
4500       I->moveBefore(&*PredBB->getFirstInsertionPt());
4501       Worklist.insert(I->op_begin(), I->op_end());
4502 
4503       // The sinking may have enabled other instructions to be sunk, so we will
4504       // need to iterate.
4505       Changed = true;
4506     }
4507   } while (Changed);
4508 }
4509 
4510 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4511   for (PHINode *OrigPhi : OrigPHIsToFix) {
4512     VPWidenPHIRecipe *VPPhi =
4513         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4514     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4515     // Make sure the builder has a valid insert point.
4516     Builder.SetInsertPoint(NewPhi);
4517     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4518       VPValue *Inc = VPPhi->getIncomingValue(i);
4519       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4520       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4521     }
4522   }
4523 }
4524 
4525 bool InnerLoopVectorizer::useOrderedReductions(
4526     const RecurrenceDescriptor &RdxDesc) {
4527   return Cost->useOrderedReductions(RdxDesc);
4528 }
4529 
4530 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4531                                               VPWidenPHIRecipe *PhiR,
4532                                               VPTransformState &State) {
4533   PHINode *P = cast<PHINode>(PN);
4534   if (EnableVPlanNativePath) {
4535     // Currently we enter here in the VPlan-native path for non-induction
4536     // PHIs where all control flow is uniform. We simply widen these PHIs.
4537     // Create a vector phi with no operands - the vector phi operands will be
4538     // set at the end of vector code generation.
4539     Type *VecTy = (State.VF.isScalar())
4540                       ? PN->getType()
4541                       : VectorType::get(PN->getType(), State.VF);
4542     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4543     State.set(PhiR, VecPhi, 0);
4544     OrigPHIsToFix.push_back(P);
4545 
4546     return;
4547   }
4548 
4549   assert(PN->getParent() == OrigLoop->getHeader() &&
4550          "Non-header phis should have been handled elsewhere");
4551 
4552   // In order to support recurrences we need to be able to vectorize Phi nodes.
4553   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4554   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4555   // this value when we vectorize all of the instructions that use the PHI.
4556 
4557   assert(!Legal->isReductionVariable(P) &&
4558          "reductions should be handled elsewhere");
4559 
4560   setDebugLocFromInst(P);
4561 
4562   // This PHINode must be an induction variable.
4563   // Make sure that we know about it.
4564   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4565 
4566   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4567   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4568 
4569   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4570   // which can be found from the original scalar operations.
4571   switch (II.getKind()) {
4572   case InductionDescriptor::IK_NoInduction:
4573     llvm_unreachable("Unknown induction");
4574   case InductionDescriptor::IK_IntInduction:
4575   case InductionDescriptor::IK_FpInduction:
4576     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4577   case InductionDescriptor::IK_PtrInduction: {
4578     // Handle the pointer induction variable case.
4579     assert(P->getType()->isPointerTy() && "Unexpected type.");
4580 
4581     if (Cost->isScalarAfterVectorization(P, State.VF)) {
4582       // This is the normalized GEP that starts counting at zero.
4583       Value *PtrInd =
4584           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4585       // Determine the number of scalars we need to generate for each unroll
4586       // iteration. If the instruction is uniform, we only need to generate the
4587       // first lane. Otherwise, we generate all VF values.
4588       bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
4589       assert((IsUniform || !State.VF.isScalable()) &&
4590              "Cannot scalarize a scalable VF");
4591       unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
4592 
4593       for (unsigned Part = 0; Part < UF; ++Part) {
4594         Value *PartStart =
4595             createStepForVF(Builder, PtrInd->getType(), VF, Part);
4596 
4597         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4598           Value *Idx = Builder.CreateAdd(
4599               PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4600           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4601           Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(),
4602                                                 DL, II, State.CFG.PrevBB);
4603           SclrGep->setName("next.gep");
4604           State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4605         }
4606       }
4607       return;
4608     }
4609     assert(isa<SCEVConstant>(II.getStep()) &&
4610            "Induction step not a SCEV constant!");
4611     Type *PhiType = II.getStep()->getType();
4612 
4613     // Build a pointer phi
4614     Value *ScalarStartValue = II.getStartValue();
4615     Type *ScStValueType = ScalarStartValue->getType();
4616     PHINode *NewPointerPhi =
4617         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4618     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4619 
4620     // A pointer induction, performed by using a gep
4621     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4622     Instruction *InductionLoc = LoopLatch->getTerminator();
4623     const SCEV *ScalarStep = II.getStep();
4624     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4625     Value *ScalarStepValue =
4626         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4627     Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4628     Value *NumUnrolledElems =
4629         Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4630     Value *InductionGEP = GetElementPtrInst::Create(
4631         II.getElementType(), NewPointerPhi,
4632         Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4633         InductionLoc);
4634     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4635 
4636     // Create UF many actual address geps that use the pointer
4637     // phi as base and a vectorized version of the step value
4638     // (<step*0, ..., step*N>) as offset.
4639     for (unsigned Part = 0; Part < State.UF; ++Part) {
4640       Type *VecPhiType = VectorType::get(PhiType, State.VF);
4641       Value *StartOffsetScalar =
4642           Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4643       Value *StartOffset =
4644           Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
4645       // Create a vector of consecutive numbers from zero to VF.
4646       StartOffset =
4647           Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4648 
4649       Value *GEP = Builder.CreateGEP(
4650           II.getElementType(), NewPointerPhi,
4651           Builder.CreateMul(
4652               StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4653               "vector.gep"));
4654       State.set(PhiR, GEP, Part);
4655     }
4656   }
4657   }
4658 }
4659 
4660 /// A helper function for checking whether an integer division-related
4661 /// instruction may divide by zero (in which case it must be predicated if
4662 /// executed conditionally in the scalar code).
4663 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4664 /// Non-zero divisors that are non compile-time constants will not be
4665 /// converted into multiplication, so we will still end up scalarizing
4666 /// the division, but can do so w/o predication.
4667 static bool mayDivideByZero(Instruction &I) {
4668   assert((I.getOpcode() == Instruction::UDiv ||
4669           I.getOpcode() == Instruction::SDiv ||
4670           I.getOpcode() == Instruction::URem ||
4671           I.getOpcode() == Instruction::SRem) &&
4672          "Unexpected instruction");
4673   Value *Divisor = I.getOperand(1);
4674   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4675   return !CInt || CInt->isZero();
4676 }
4677 
4678 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4679                                                VPUser &ArgOperands,
4680                                                VPTransformState &State) {
4681   assert(!isa<DbgInfoIntrinsic>(I) &&
4682          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4683   setDebugLocFromInst(&I);
4684 
4685   Module *M = I.getParent()->getParent()->getParent();
4686   auto *CI = cast<CallInst>(&I);
4687 
4688   SmallVector<Type *, 4> Tys;
4689   for (Value *ArgOperand : CI->args())
4690     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4691 
4692   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4693 
4694   // The flag shows whether we use Intrinsic or a usual Call for vectorized
4695   // version of the instruction.
4696   // Is it beneficial to perform intrinsic call compared to lib call?
4697   bool NeedToScalarize = false;
4698   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4699   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4700   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4701   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4702          "Instruction should be scalarized elsewhere.");
4703   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4704          "Either the intrinsic cost or vector call cost must be valid");
4705 
4706   for (unsigned Part = 0; Part < UF; ++Part) {
4707     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4708     SmallVector<Value *, 4> Args;
4709     for (auto &I : enumerate(ArgOperands.operands())) {
4710       // Some intrinsics have a scalar argument - don't replace it with a
4711       // vector.
4712       Value *Arg;
4713       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4714         Arg = State.get(I.value(), Part);
4715       else {
4716         Arg = State.get(I.value(), VPIteration(0, 0));
4717         if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
4718           TysForDecl.push_back(Arg->getType());
4719       }
4720       Args.push_back(Arg);
4721     }
4722 
4723     Function *VectorF;
4724     if (UseVectorIntrinsic) {
4725       // Use vector version of the intrinsic.
4726       if (VF.isVector())
4727         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4728       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4729       assert(VectorF && "Can't retrieve vector intrinsic.");
4730     } else {
4731       // Use vector version of the function call.
4732       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4733 #ifndef NDEBUG
4734       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4735              "Can't create vector function.");
4736 #endif
4737         VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4738     }
4739       SmallVector<OperandBundleDef, 1> OpBundles;
4740       CI->getOperandBundlesAsDefs(OpBundles);
4741       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4742 
4743       if (isa<FPMathOperator>(V))
4744         V->copyFastMathFlags(CI);
4745 
4746       State.set(Def, V, Part);
4747       addMetadata(V, &I);
4748   }
4749 }
4750 
4751 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4752   // We should not collect Scalars more than once per VF. Right now, this
4753   // function is called from collectUniformsAndScalars(), which already does
4754   // this check. Collecting Scalars for VF=1 does not make any sense.
4755   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4756          "This function should not be visited twice for the same VF");
4757 
4758   SmallSetVector<Instruction *, 8> Worklist;
4759 
4760   // These sets are used to seed the analysis with pointers used by memory
4761   // accesses that will remain scalar.
4762   SmallSetVector<Instruction *, 8> ScalarPtrs;
4763   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4764   auto *Latch = TheLoop->getLoopLatch();
4765 
4766   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4767   // The pointer operands of loads and stores will be scalar as long as the
4768   // memory access is not a gather or scatter operation. The value operand of a
4769   // store will remain scalar if the store is scalarized.
4770   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4771     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4772     assert(WideningDecision != CM_Unknown &&
4773            "Widening decision should be ready at this moment");
4774     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4775       if (Ptr == Store->getValueOperand())
4776         return WideningDecision == CM_Scalarize;
4777     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4778            "Ptr is neither a value or pointer operand");
4779     return WideningDecision != CM_GatherScatter;
4780   };
4781 
4782   // A helper that returns true if the given value is a bitcast or
4783   // getelementptr instruction contained in the loop.
4784   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4785     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4786             isa<GetElementPtrInst>(V)) &&
4787            !TheLoop->isLoopInvariant(V);
4788   };
4789 
4790   // A helper that evaluates a memory access's use of a pointer. If the use will
4791   // be a scalar use and the pointer is only used by memory accesses, we place
4792   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4793   // PossibleNonScalarPtrs.
4794   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4795     // We only care about bitcast and getelementptr instructions contained in
4796     // the loop.
4797     if (!isLoopVaryingBitCastOrGEP(Ptr))
4798       return;
4799 
4800     // If the pointer has already been identified as scalar (e.g., if it was
4801     // also identified as uniform), there's nothing to do.
4802     auto *I = cast<Instruction>(Ptr);
4803     if (Worklist.count(I))
4804       return;
4805 
4806     // If the use of the pointer will be a scalar use, and all users of the
4807     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4808     // place the pointer in PossibleNonScalarPtrs.
4809     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4810           return isa<LoadInst>(U) || isa<StoreInst>(U);
4811         }))
4812       ScalarPtrs.insert(I);
4813     else
4814       PossibleNonScalarPtrs.insert(I);
4815   };
4816 
4817   // We seed the scalars analysis with three classes of instructions: (1)
4818   // instructions marked uniform-after-vectorization and (2) bitcast,
4819   // getelementptr and (pointer) phi instructions used by memory accesses
4820   // requiring a scalar use.
4821   //
4822   // (1) Add to the worklist all instructions that have been identified as
4823   // uniform-after-vectorization.
4824   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4825 
4826   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4827   // memory accesses requiring a scalar use. The pointer operands of loads and
4828   // stores will be scalar as long as the memory accesses is not a gather or
4829   // scatter operation. The value operand of a store will remain scalar if the
4830   // store is scalarized.
4831   for (auto *BB : TheLoop->blocks())
4832     for (auto &I : *BB) {
4833       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4834         evaluatePtrUse(Load, Load->getPointerOperand());
4835       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4836         evaluatePtrUse(Store, Store->getPointerOperand());
4837         evaluatePtrUse(Store, Store->getValueOperand());
4838       }
4839     }
4840   for (auto *I : ScalarPtrs)
4841     if (!PossibleNonScalarPtrs.count(I)) {
4842       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4843       Worklist.insert(I);
4844     }
4845 
4846   // Insert the forced scalars.
4847   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4848   // induction variable when the PHI user is scalarized.
4849   auto ForcedScalar = ForcedScalars.find(VF);
4850   if (ForcedScalar != ForcedScalars.end())
4851     for (auto *I : ForcedScalar->second)
4852       Worklist.insert(I);
4853 
4854   // Expand the worklist by looking through any bitcasts and getelementptr
4855   // instructions we've already identified as scalar. This is similar to the
4856   // expansion step in collectLoopUniforms(); however, here we're only
4857   // expanding to include additional bitcasts and getelementptr instructions.
4858   unsigned Idx = 0;
4859   while (Idx != Worklist.size()) {
4860     Instruction *Dst = Worklist[Idx++];
4861     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4862       continue;
4863     auto *Src = cast<Instruction>(Dst->getOperand(0));
4864     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4865           auto *J = cast<Instruction>(U);
4866           return !TheLoop->contains(J) || Worklist.count(J) ||
4867                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4868                   isScalarUse(J, Src));
4869         })) {
4870       Worklist.insert(Src);
4871       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4872     }
4873   }
4874 
4875   // An induction variable will remain scalar if all users of the induction
4876   // variable and induction variable update remain scalar.
4877   for (auto &Induction : Legal->getInductionVars()) {
4878     auto *Ind = Induction.first;
4879     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4880 
4881     // If tail-folding is applied, the primary induction variable will be used
4882     // to feed a vector compare.
4883     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4884       continue;
4885 
4886     // Returns true if \p Indvar is a pointer induction that is used directly by
4887     // load/store instruction \p I.
4888     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4889                                               Instruction *I) {
4890       return Induction.second.getKind() ==
4891                  InductionDescriptor::IK_PtrInduction &&
4892              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4893              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4894     };
4895 
4896     // Determine if all users of the induction variable are scalar after
4897     // vectorization.
4898     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4899       auto *I = cast<Instruction>(U);
4900       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4901              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4902     });
4903     if (!ScalarInd)
4904       continue;
4905 
4906     // Determine if all users of the induction variable update instruction are
4907     // scalar after vectorization.
4908     auto ScalarIndUpdate =
4909         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4910           auto *I = cast<Instruction>(U);
4911           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4912                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4913         });
4914     if (!ScalarIndUpdate)
4915       continue;
4916 
4917     // The induction variable and its update instruction will remain scalar.
4918     Worklist.insert(Ind);
4919     Worklist.insert(IndUpdate);
4920     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4921     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4922                       << "\n");
4923   }
4924 
4925   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4926 }
4927 
4928 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
4929   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4930     return false;
4931   switch(I->getOpcode()) {
4932   default:
4933     break;
4934   case Instruction::Load:
4935   case Instruction::Store: {
4936     if (!Legal->isMaskRequired(I))
4937       return false;
4938     auto *Ptr = getLoadStorePointerOperand(I);
4939     auto *Ty = getLoadStoreType(I);
4940     const Align Alignment = getLoadStoreAlignment(I);
4941     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4942                                 TTI.isLegalMaskedGather(Ty, Alignment))
4943                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4944                                 TTI.isLegalMaskedScatter(Ty, Alignment));
4945   }
4946   case Instruction::UDiv:
4947   case Instruction::SDiv:
4948   case Instruction::SRem:
4949   case Instruction::URem:
4950     return mayDivideByZero(*I);
4951   }
4952   return false;
4953 }
4954 
4955 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4956     Instruction *I, ElementCount VF) {
4957   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4958   assert(getWideningDecision(I, VF) == CM_Unknown &&
4959          "Decision should not be set yet.");
4960   auto *Group = getInterleavedAccessGroup(I);
4961   assert(Group && "Must have a group.");
4962 
4963   // If the instruction's allocated size doesn't equal it's type size, it
4964   // requires padding and will be scalarized.
4965   auto &DL = I->getModule()->getDataLayout();
4966   auto *ScalarTy = getLoadStoreType(I);
4967   if (hasIrregularType(ScalarTy, DL))
4968     return false;
4969 
4970   // Check if masking is required.
4971   // A Group may need masking for one of two reasons: it resides in a block that
4972   // needs predication, or it was decided to use masking to deal with gaps
4973   // (either a gap at the end of a load-access that may result in a speculative
4974   // load, or any gaps in a store-access).
4975   bool PredicatedAccessRequiresMasking =
4976       blockNeedsPredicationForAnyReason(I->getParent()) &&
4977       Legal->isMaskRequired(I);
4978   bool LoadAccessWithGapsRequiresEpilogMasking =
4979       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4980       !isScalarEpilogueAllowed();
4981   bool StoreAccessWithGapsRequiresMasking =
4982       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4983   if (!PredicatedAccessRequiresMasking &&
4984       !LoadAccessWithGapsRequiresEpilogMasking &&
4985       !StoreAccessWithGapsRequiresMasking)
4986     return true;
4987 
4988   // If masked interleaving is required, we expect that the user/target had
4989   // enabled it, because otherwise it either wouldn't have been created or
4990   // it should have been invalidated by the CostModel.
4991   assert(useMaskedInterleavedAccesses(TTI) &&
4992          "Masked interleave-groups for predicated accesses are not enabled.");
4993 
4994   if (Group->isReverse())
4995     return false;
4996 
4997   auto *Ty = getLoadStoreType(I);
4998   const Align Alignment = getLoadStoreAlignment(I);
4999   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5000                           : TTI.isLegalMaskedStore(Ty, Alignment);
5001 }
5002 
5003 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5004     Instruction *I, ElementCount VF) {
5005   // Get and ensure we have a valid memory instruction.
5006   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
5007 
5008   auto *Ptr = getLoadStorePointerOperand(I);
5009   auto *ScalarTy = getLoadStoreType(I);
5010 
5011   // In order to be widened, the pointer should be consecutive, first of all.
5012   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
5013     return false;
5014 
5015   // If the instruction is a store located in a predicated block, it will be
5016   // scalarized.
5017   if (isScalarWithPredication(I))
5018     return false;
5019 
5020   // If the instruction's allocated size doesn't equal it's type size, it
5021   // requires padding and will be scalarized.
5022   auto &DL = I->getModule()->getDataLayout();
5023   if (hasIrregularType(ScalarTy, DL))
5024     return false;
5025 
5026   return true;
5027 }
5028 
5029 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5030   // We should not collect Uniforms more than once per VF. Right now,
5031   // this function is called from collectUniformsAndScalars(), which
5032   // already does this check. Collecting Uniforms for VF=1 does not make any
5033   // sense.
5034 
5035   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5036          "This function should not be visited twice for the same VF");
5037 
5038   // Visit the list of Uniforms. If we'll not find any uniform value, we'll
5039   // not analyze again.  Uniforms.count(VF) will return 1.
5040   Uniforms[VF].clear();
5041 
5042   // We now know that the loop is vectorizable!
5043   // Collect instructions inside the loop that will remain uniform after
5044   // vectorization.
5045 
5046   // Global values, params and instructions outside of current loop are out of
5047   // scope.
5048   auto isOutOfScope = [&](Value *V) -> bool {
5049     Instruction *I = dyn_cast<Instruction>(V);
5050     return (!I || !TheLoop->contains(I));
5051   };
5052 
5053   // Worklist containing uniform instructions demanding lane 0.
5054   SetVector<Instruction *> Worklist;
5055   BasicBlock *Latch = TheLoop->getLoopLatch();
5056 
5057   // Add uniform instructions demanding lane 0 to the worklist. Instructions
5058   // that are scalar with predication must not be considered uniform after
5059   // vectorization, because that would create an erroneous replicating region
5060   // where only a single instance out of VF should be formed.
5061   // TODO: optimize such seldom cases if found important, see PR40816.
5062   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5063     if (isOutOfScope(I)) {
5064       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5065                         << *I << "\n");
5066       return;
5067     }
5068     if (isScalarWithPredication(I)) {
5069       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5070                         << *I << "\n");
5071       return;
5072     }
5073     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5074     Worklist.insert(I);
5075   };
5076 
5077   // Start with the conditional branch. If the branch condition is an
5078   // instruction contained in the loop that is only used by the branch, it is
5079   // uniform.
5080   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5081   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5082     addToWorklistIfAllowed(Cmp);
5083 
5084   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5085     InstWidening WideningDecision = getWideningDecision(I, VF);
5086     assert(WideningDecision != CM_Unknown &&
5087            "Widening decision should be ready at this moment");
5088 
5089     // A uniform memory op is itself uniform.  We exclude uniform stores
5090     // here as they demand the last lane, not the first one.
5091     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5092       assert(WideningDecision == CM_Scalarize);
5093       return true;
5094     }
5095 
5096     return (WideningDecision == CM_Widen ||
5097             WideningDecision == CM_Widen_Reverse ||
5098             WideningDecision == CM_Interleave);
5099   };
5100 
5101 
5102   // Returns true if Ptr is the pointer operand of a memory access instruction
5103   // I, and I is known to not require scalarization.
5104   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5105     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5106   };
5107 
5108   // Holds a list of values which are known to have at least one uniform use.
5109   // Note that there may be other uses which aren't uniform.  A "uniform use"
5110   // here is something which only demands lane 0 of the unrolled iterations;
5111   // it does not imply that all lanes produce the same value (e.g. this is not
5112   // the usual meaning of uniform)
5113   SetVector<Value *> HasUniformUse;
5114 
5115   // Scan the loop for instructions which are either a) known to have only
5116   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5117   for (auto *BB : TheLoop->blocks())
5118     for (auto &I : *BB) {
5119       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
5120         switch (II->getIntrinsicID()) {
5121         case Intrinsic::sideeffect:
5122         case Intrinsic::experimental_noalias_scope_decl:
5123         case Intrinsic::assume:
5124         case Intrinsic::lifetime_start:
5125         case Intrinsic::lifetime_end:
5126           if (TheLoop->hasLoopInvariantOperands(&I))
5127             addToWorklistIfAllowed(&I);
5128           break;
5129         default:
5130           break;
5131         }
5132       }
5133 
5134       // ExtractValue instructions must be uniform, because the operands are
5135       // known to be loop-invariant.
5136       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
5137         assert(isOutOfScope(EVI->getAggregateOperand()) &&
5138                "Expected aggregate value to be loop invariant");
5139         addToWorklistIfAllowed(EVI);
5140         continue;
5141       }
5142 
5143       // If there's no pointer operand, there's nothing to do.
5144       auto *Ptr = getLoadStorePointerOperand(&I);
5145       if (!Ptr)
5146         continue;
5147 
5148       // A uniform memory op is itself uniform.  We exclude uniform stores
5149       // here as they demand the last lane, not the first one.
5150       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5151         addToWorklistIfAllowed(&I);
5152 
5153       if (isUniformDecision(&I, VF)) {
5154         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5155         HasUniformUse.insert(Ptr);
5156       }
5157     }
5158 
5159   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5160   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5161   // disallows uses outside the loop as well.
5162   for (auto *V : HasUniformUse) {
5163     if (isOutOfScope(V))
5164       continue;
5165     auto *I = cast<Instruction>(V);
5166     auto UsersAreMemAccesses =
5167       llvm::all_of(I->users(), [&](User *U) -> bool {
5168         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5169       });
5170     if (UsersAreMemAccesses)
5171       addToWorklistIfAllowed(I);
5172   }
5173 
5174   // Expand Worklist in topological order: whenever a new instruction
5175   // is added , its users should be already inside Worklist.  It ensures
5176   // a uniform instruction will only be used by uniform instructions.
5177   unsigned idx = 0;
5178   while (idx != Worklist.size()) {
5179     Instruction *I = Worklist[idx++];
5180 
5181     for (auto OV : I->operand_values()) {
5182       // isOutOfScope operands cannot be uniform instructions.
5183       if (isOutOfScope(OV))
5184         continue;
5185       // First order recurrence Phi's should typically be considered
5186       // non-uniform.
5187       auto *OP = dyn_cast<PHINode>(OV);
5188       if (OP && Legal->isFirstOrderRecurrence(OP))
5189         continue;
5190       // If all the users of the operand are uniform, then add the
5191       // operand into the uniform worklist.
5192       auto *OI = cast<Instruction>(OV);
5193       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5194             auto *J = cast<Instruction>(U);
5195             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5196           }))
5197         addToWorklistIfAllowed(OI);
5198     }
5199   }
5200 
5201   // For an instruction to be added into Worklist above, all its users inside
5202   // the loop should also be in Worklist. However, this condition cannot be
5203   // true for phi nodes that form a cyclic dependence. We must process phi
5204   // nodes separately. An induction variable will remain uniform if all users
5205   // of the induction variable and induction variable update remain uniform.
5206   // The code below handles both pointer and non-pointer induction variables.
5207   for (auto &Induction : Legal->getInductionVars()) {
5208     auto *Ind = Induction.first;
5209     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5210 
5211     // Determine if all users of the induction variable are uniform after
5212     // vectorization.
5213     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5214       auto *I = cast<Instruction>(U);
5215       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5216              isVectorizedMemAccessUse(I, Ind);
5217     });
5218     if (!UniformInd)
5219       continue;
5220 
5221     // Determine if all users of the induction variable update instruction are
5222     // uniform after vectorization.
5223     auto UniformIndUpdate =
5224         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5225           auto *I = cast<Instruction>(U);
5226           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5227                  isVectorizedMemAccessUse(I, IndUpdate);
5228         });
5229     if (!UniformIndUpdate)
5230       continue;
5231 
5232     // The induction variable and its update instruction will remain uniform.
5233     addToWorklistIfAllowed(Ind);
5234     addToWorklistIfAllowed(IndUpdate);
5235   }
5236 
5237   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5238 }
5239 
5240 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5241   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5242 
5243   if (Legal->getRuntimePointerChecking()->Need) {
5244     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5245         "runtime pointer checks needed. Enable vectorization of this "
5246         "loop with '#pragma clang loop vectorize(enable)' when "
5247         "compiling with -Os/-Oz",
5248         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5249     return true;
5250   }
5251 
5252   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5253     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5254         "runtime SCEV checks needed. Enable vectorization of this "
5255         "loop with '#pragma clang loop vectorize(enable)' when "
5256         "compiling with -Os/-Oz",
5257         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5258     return true;
5259   }
5260 
5261   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5262   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5263     reportVectorizationFailure("Runtime stride check for small trip count",
5264         "runtime stride == 1 checks needed. Enable vectorization of "
5265         "this loop without such check by compiling with -Os/-Oz",
5266         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5267     return true;
5268   }
5269 
5270   return false;
5271 }
5272 
5273 ElementCount
5274 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
5275   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
5276     return ElementCount::getScalable(0);
5277 
5278   if (Hints->isScalableVectorizationDisabled()) {
5279     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
5280                             "ScalableVectorizationDisabled", ORE, TheLoop);
5281     return ElementCount::getScalable(0);
5282   }
5283 
5284   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
5285 
5286   auto MaxScalableVF = ElementCount::getScalable(
5287       std::numeric_limits<ElementCount::ScalarTy>::max());
5288 
5289   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
5290   // FIXME: While for scalable vectors this is currently sufficient, this should
5291   // be replaced by a more detailed mechanism that filters out specific VFs,
5292   // instead of invalidating vectorization for a whole set of VFs based on the
5293   // MaxVF.
5294 
5295   // Disable scalable vectorization if the loop contains unsupported reductions.
5296   if (!canVectorizeReductions(MaxScalableVF)) {
5297     reportVectorizationInfo(
5298         "Scalable vectorization not supported for the reduction "
5299         "operations found in this loop.",
5300         "ScalableVFUnfeasible", ORE, TheLoop);
5301     return ElementCount::getScalable(0);
5302   }
5303 
5304   // Disable scalable vectorization if the loop contains any instructions
5305   // with element types not supported for scalable vectors.
5306   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
5307         return !Ty->isVoidTy() &&
5308                !this->TTI.isElementTypeLegalForScalableVector(Ty);
5309       })) {
5310     reportVectorizationInfo("Scalable vectorization is not supported "
5311                             "for all element types found in this loop.",
5312                             "ScalableVFUnfeasible", ORE, TheLoop);
5313     return ElementCount::getScalable(0);
5314   }
5315 
5316   if (Legal->isSafeForAnyVectorWidth())
5317     return MaxScalableVF;
5318 
5319   // Limit MaxScalableVF by the maximum safe dependence distance.
5320   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5321   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
5322     MaxVScale =
5323         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
5324   MaxScalableVF = ElementCount::getScalable(
5325       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5326   if (!MaxScalableVF)
5327     reportVectorizationInfo(
5328         "Max legal vector width too small, scalable vectorization "
5329         "unfeasible.",
5330         "ScalableVFUnfeasible", ORE, TheLoop);
5331 
5332   return MaxScalableVF;
5333 }
5334 
5335 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
5336     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
5337   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5338   unsigned SmallestType, WidestType;
5339   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5340 
5341   // Get the maximum safe dependence distance in bits computed by LAA.
5342   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5343   // the memory accesses that is most restrictive (involved in the smallest
5344   // dependence distance).
5345   unsigned MaxSafeElements =
5346       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
5347 
5348   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
5349   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
5350 
5351   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5352                     << ".\n");
5353   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5354                     << ".\n");
5355 
5356   // First analyze the UserVF, fall back if the UserVF should be ignored.
5357   if (UserVF) {
5358     auto MaxSafeUserVF =
5359         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
5360 
5361     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
5362       // If `VF=vscale x N` is safe, then so is `VF=N`
5363       if (UserVF.isScalable())
5364         return FixedScalableVFPair(
5365             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
5366       else
5367         return UserVF;
5368     }
5369 
5370     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5371 
5372     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
5373     // is better to ignore the hint and let the compiler choose a suitable VF.
5374     if (!UserVF.isScalable()) {
5375       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5376                         << " is unsafe, clamping to max safe VF="
5377                         << MaxSafeFixedVF << ".\n");
5378       ORE->emit([&]() {
5379         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5380                                           TheLoop->getStartLoc(),
5381                                           TheLoop->getHeader())
5382                << "User-specified vectorization factor "
5383                << ore::NV("UserVectorizationFactor", UserVF)
5384                << " is unsafe, clamping to maximum safe vectorization factor "
5385                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
5386       });
5387       return MaxSafeFixedVF;
5388     }
5389 
5390     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
5391       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5392                         << " is ignored because scalable vectors are not "
5393                            "available.\n");
5394       ORE->emit([&]() {
5395         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5396                                           TheLoop->getStartLoc(),
5397                                           TheLoop->getHeader())
5398                << "User-specified vectorization factor "
5399                << ore::NV("UserVectorizationFactor", UserVF)
5400                << " is ignored because the target does not support scalable "
5401                   "vectors. The compiler will pick a more suitable value.";
5402       });
5403     } else {
5404       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5405                         << " is unsafe. Ignoring scalable UserVF.\n");
5406       ORE->emit([&]() {
5407         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5408                                           TheLoop->getStartLoc(),
5409                                           TheLoop->getHeader())
5410                << "User-specified vectorization factor "
5411                << ore::NV("UserVectorizationFactor", UserVF)
5412                << " is unsafe. Ignoring the hint to let the compiler pick a "
5413                   "more suitable value.";
5414       });
5415     }
5416   }
5417 
5418   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5419                     << " / " << WidestType << " bits.\n");
5420 
5421   FixedScalableVFPair Result(ElementCount::getFixed(1),
5422                              ElementCount::getScalable(0));
5423   if (auto MaxVF =
5424           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5425                                   MaxSafeFixedVF, FoldTailByMasking))
5426     Result.FixedVF = MaxVF;
5427 
5428   if (auto MaxVF =
5429           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5430                                   MaxSafeScalableVF, FoldTailByMasking))
5431     if (MaxVF.isScalable()) {
5432       Result.ScalableVF = MaxVF;
5433       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5434                         << "\n");
5435     }
5436 
5437   return Result;
5438 }
5439 
5440 FixedScalableVFPair
5441 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5442   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5443     // TODO: It may by useful to do since it's still likely to be dynamically
5444     // uniform if the target can skip.
5445     reportVectorizationFailure(
5446         "Not inserting runtime ptr check for divergent target",
5447         "runtime pointer checks needed. Not enabled for divergent target",
5448         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5449     return FixedScalableVFPair::getNone();
5450   }
5451 
5452   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5453   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5454   if (TC == 1) {
5455     reportVectorizationFailure("Single iteration (non) loop",
5456         "loop trip count is one, irrelevant for vectorization",
5457         "SingleIterationLoop", ORE, TheLoop);
5458     return FixedScalableVFPair::getNone();
5459   }
5460 
5461   switch (ScalarEpilogueStatus) {
5462   case CM_ScalarEpilogueAllowed:
5463     return computeFeasibleMaxVF(TC, UserVF, false);
5464   case CM_ScalarEpilogueNotAllowedUsePredicate:
5465     LLVM_FALLTHROUGH;
5466   case CM_ScalarEpilogueNotNeededUsePredicate:
5467     LLVM_DEBUG(
5468         dbgs() << "LV: vector predicate hint/switch found.\n"
5469                << "LV: Not allowing scalar epilogue, creating predicated "
5470                << "vector loop.\n");
5471     break;
5472   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5473     // fallthrough as a special case of OptForSize
5474   case CM_ScalarEpilogueNotAllowedOptSize:
5475     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5476       LLVM_DEBUG(
5477           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5478     else
5479       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5480                         << "count.\n");
5481 
5482     // Bail if runtime checks are required, which are not good when optimising
5483     // for size.
5484     if (runtimeChecksRequired())
5485       return FixedScalableVFPair::getNone();
5486 
5487     break;
5488   }
5489 
5490   // The only loops we can vectorize without a scalar epilogue, are loops with
5491   // a bottom-test and a single exiting block. We'd have to handle the fact
5492   // that not every instruction executes on the last iteration.  This will
5493   // require a lane mask which varies through the vector loop body.  (TODO)
5494   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5495     // If there was a tail-folding hint/switch, but we can't fold the tail by
5496     // masking, fallback to a vectorization with a scalar epilogue.
5497     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5498       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5499                            "scalar epilogue instead.\n");
5500       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5501       return computeFeasibleMaxVF(TC, UserVF, false);
5502     }
5503     return FixedScalableVFPair::getNone();
5504   }
5505 
5506   // Now try the tail folding
5507 
5508   // Invalidate interleave groups that require an epilogue if we can't mask
5509   // the interleave-group.
5510   if (!useMaskedInterleavedAccesses(TTI)) {
5511     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5512            "No decisions should have been taken at this point");
5513     // Note: There is no need to invalidate any cost modeling decisions here, as
5514     // non where taken so far.
5515     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5516   }
5517 
5518   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5519   // Avoid tail folding if the trip count is known to be a multiple of any VF
5520   // we chose.
5521   // FIXME: The condition below pessimises the case for fixed-width vectors,
5522   // when scalable VFs are also candidates for vectorization.
5523   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5524     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5525     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5526            "MaxFixedVF must be a power of 2");
5527     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5528                                    : MaxFixedVF.getFixedValue();
5529     ScalarEvolution *SE = PSE.getSE();
5530     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5531     const SCEV *ExitCount = SE->getAddExpr(
5532         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5533     const SCEV *Rem = SE->getURemExpr(
5534         SE->applyLoopGuards(ExitCount, TheLoop),
5535         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5536     if (Rem->isZero()) {
5537       // Accept MaxFixedVF if we do not have a tail.
5538       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5539       return MaxFactors;
5540     }
5541   }
5542 
5543   // For scalable vectors, don't use tail folding as this is currently not yet
5544   // supported. The code is likely to have ended up here if the tripcount is
5545   // low, in which case it makes sense not to use scalable vectors.
5546   if (MaxFactors.ScalableVF.isVector())
5547     MaxFactors.ScalableVF = ElementCount::getScalable(0);
5548 
5549   // If we don't know the precise trip count, or if the trip count that we
5550   // found modulo the vectorization factor is not zero, try to fold the tail
5551   // by masking.
5552   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5553   if (Legal->prepareToFoldTailByMasking()) {
5554     FoldTailByMasking = true;
5555     return MaxFactors;
5556   }
5557 
5558   // If there was a tail-folding hint/switch, but we can't fold the tail by
5559   // masking, fallback to a vectorization with a scalar epilogue.
5560   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5561     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5562                          "scalar epilogue instead.\n");
5563     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5564     return MaxFactors;
5565   }
5566 
5567   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5568     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5569     return FixedScalableVFPair::getNone();
5570   }
5571 
5572   if (TC == 0) {
5573     reportVectorizationFailure(
5574         "Unable to calculate the loop count due to complex control flow",
5575         "unable to calculate the loop count due to complex control flow",
5576         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5577     return FixedScalableVFPair::getNone();
5578   }
5579 
5580   reportVectorizationFailure(
5581       "Cannot optimize for size and vectorize at the same time.",
5582       "cannot optimize for size and vectorize at the same time. "
5583       "Enable vectorization of this loop with '#pragma clang loop "
5584       "vectorize(enable)' when compiling with -Os/-Oz",
5585       "NoTailLoopWithOptForSize", ORE, TheLoop);
5586   return FixedScalableVFPair::getNone();
5587 }
5588 
5589 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5590     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5591     const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
5592   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5593   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5594       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5595                            : TargetTransformInfo::RGK_FixedWidthVector);
5596 
5597   // Convenience function to return the minimum of two ElementCounts.
5598   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5599     assert((LHS.isScalable() == RHS.isScalable()) &&
5600            "Scalable flags must match");
5601     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5602   };
5603 
5604   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5605   // Note that both WidestRegister and WidestType may not be a powers of 2.
5606   auto MaxVectorElementCount = ElementCount::get(
5607       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5608       ComputeScalableMaxVF);
5609   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5610   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5611                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5612 
5613   if (!MaxVectorElementCount) {
5614     LLVM_DEBUG(dbgs() << "LV: The target has no "
5615                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5616                       << " vector registers.\n");
5617     return ElementCount::getFixed(1);
5618   }
5619 
5620   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5621   if (ConstTripCount &&
5622       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5623       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5624     // If loop trip count (TC) is known at compile time there is no point in
5625     // choosing VF greater than TC (as done in the loop below). Select maximum
5626     // power of two which doesn't exceed TC.
5627     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5628     // when the TC is less than or equal to the known number of lanes.
5629     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5630     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5631                          "exceeding the constant trip count: "
5632                       << ClampedConstTripCount << "\n");
5633     return ElementCount::getFixed(ClampedConstTripCount);
5634   }
5635 
5636   ElementCount MaxVF = MaxVectorElementCount;
5637   if (TTI.shouldMaximizeVectorBandwidth() ||
5638       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5639     auto MaxVectorElementCountMaxBW = ElementCount::get(
5640         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5641         ComputeScalableMaxVF);
5642     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5643 
5644     // Collect all viable vectorization factors larger than the default MaxVF
5645     // (i.e. MaxVectorElementCount).
5646     SmallVector<ElementCount, 8> VFs;
5647     for (ElementCount VS = MaxVectorElementCount * 2;
5648          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5649       VFs.push_back(VS);
5650 
5651     // For each VF calculate its register usage.
5652     auto RUs = calculateRegisterUsage(VFs);
5653 
5654     // Select the largest VF which doesn't require more registers than existing
5655     // ones.
5656     for (int i = RUs.size() - 1; i >= 0; --i) {
5657       bool Selected = true;
5658       for (auto &pair : RUs[i].MaxLocalUsers) {
5659         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5660         if (pair.second > TargetNumRegisters)
5661           Selected = false;
5662       }
5663       if (Selected) {
5664         MaxVF = VFs[i];
5665         break;
5666       }
5667     }
5668     if (ElementCount MinVF =
5669             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5670       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5671         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5672                           << ") with target's minimum: " << MinVF << '\n');
5673         MaxVF = MinVF;
5674       }
5675     }
5676   }
5677   return MaxVF;
5678 }
5679 
5680 bool LoopVectorizationCostModel::isMoreProfitable(
5681     const VectorizationFactor &A, const VectorizationFactor &B) const {
5682   InstructionCost CostA = A.Cost;
5683   InstructionCost CostB = B.Cost;
5684 
5685   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5686 
5687   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5688       MaxTripCount) {
5689     // If we are folding the tail and the trip count is a known (possibly small)
5690     // constant, the trip count will be rounded up to an integer number of
5691     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5692     // which we compare directly. When not folding the tail, the total cost will
5693     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5694     // approximated with the per-lane cost below instead of using the tripcount
5695     // as here.
5696     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5697     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5698     return RTCostA < RTCostB;
5699   }
5700 
5701   // Improve estimate for the vector width if it is scalable.
5702   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5703   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5704   if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
5705     if (A.Width.isScalable())
5706       EstimatedWidthA *= VScale.getValue();
5707     if (B.Width.isScalable())
5708       EstimatedWidthB *= VScale.getValue();
5709   }
5710 
5711   // Assume vscale may be larger than 1 (or the value being tuned for),
5712   // so that scalable vectorization is slightly favorable over fixed-width
5713   // vectorization.
5714   if (A.Width.isScalable() && !B.Width.isScalable())
5715     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5716 
5717   // To avoid the need for FP division:
5718   //      (CostA / A.Width) < (CostB / B.Width)
5719   // <=>  (CostA * B.Width) < (CostB * A.Width)
5720   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5721 }
5722 
5723 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5724     const ElementCountSet &VFCandidates) {
5725   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5726   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5727   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5728   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5729          "Expected Scalar VF to be a candidate");
5730 
5731   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
5732   VectorizationFactor ChosenFactor = ScalarCost;
5733 
5734   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5735   if (ForceVectorization && VFCandidates.size() > 1) {
5736     // Ignore scalar width, because the user explicitly wants vectorization.
5737     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5738     // evaluation.
5739     ChosenFactor.Cost = InstructionCost::getMax();
5740   }
5741 
5742   SmallVector<InstructionVFPair> InvalidCosts;
5743   for (const auto &i : VFCandidates) {
5744     // The cost for scalar VF=1 is already calculated, so ignore it.
5745     if (i.isScalar())
5746       continue;
5747 
5748     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5749     VectorizationFactor Candidate(i, C.first);
5750 
5751 #ifndef NDEBUG
5752     unsigned AssumedMinimumVscale = 1;
5753     if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
5754       AssumedMinimumVscale = VScale.getValue();
5755     unsigned Width =
5756         Candidate.Width.isScalable()
5757             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5758             : Candidate.Width.getFixedValue();
5759     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5760                       << " costs: " << (Candidate.Cost / Width));
5761     if (i.isScalable())
5762       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5763                         << AssumedMinimumVscale << ")");
5764     LLVM_DEBUG(dbgs() << ".\n");
5765 #endif
5766 
5767     if (!C.second && !ForceVectorization) {
5768       LLVM_DEBUG(
5769           dbgs() << "LV: Not considering vector loop of width " << i
5770                  << " because it will not generate any vector instructions.\n");
5771       continue;
5772     }
5773 
5774     // If profitable add it to ProfitableVF list.
5775     if (isMoreProfitable(Candidate, ScalarCost))
5776       ProfitableVFs.push_back(Candidate);
5777 
5778     if (isMoreProfitable(Candidate, ChosenFactor))
5779       ChosenFactor = Candidate;
5780   }
5781 
5782   // Emit a report of VFs with invalid costs in the loop.
5783   if (!InvalidCosts.empty()) {
5784     // Group the remarks per instruction, keeping the instruction order from
5785     // InvalidCosts.
5786     std::map<Instruction *, unsigned> Numbering;
5787     unsigned I = 0;
5788     for (auto &Pair : InvalidCosts)
5789       if (!Numbering.count(Pair.first))
5790         Numbering[Pair.first] = I++;
5791 
5792     // Sort the list, first on instruction(number) then on VF.
5793     llvm::sort(InvalidCosts,
5794                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5795                  if (Numbering[A.first] != Numbering[B.first])
5796                    return Numbering[A.first] < Numbering[B.first];
5797                  ElementCountComparator ECC;
5798                  return ECC(A.second, B.second);
5799                });
5800 
5801     // For a list of ordered instruction-vf pairs:
5802     //   [(load, vf1), (load, vf2), (store, vf1)]
5803     // Group the instructions together to emit separate remarks for:
5804     //   load  (vf1, vf2)
5805     //   store (vf1)
5806     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5807     auto Subset = ArrayRef<InstructionVFPair>();
5808     do {
5809       if (Subset.empty())
5810         Subset = Tail.take_front(1);
5811 
5812       Instruction *I = Subset.front().first;
5813 
5814       // If the next instruction is different, or if there are no other pairs,
5815       // emit a remark for the collated subset. e.g.
5816       //   [(load, vf1), (load, vf2))]
5817       // to emit:
5818       //  remark: invalid costs for 'load' at VF=(vf, vf2)
5819       if (Subset == Tail || Tail[Subset.size()].first != I) {
5820         std::string OutString;
5821         raw_string_ostream OS(OutString);
5822         assert(!Subset.empty() && "Unexpected empty range");
5823         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5824         for (auto &Pair : Subset)
5825           OS << (Pair.second == Subset.front().second ? "" : ", ")
5826              << Pair.second;
5827         OS << "):";
5828         if (auto *CI = dyn_cast<CallInst>(I))
5829           OS << " call to " << CI->getCalledFunction()->getName();
5830         else
5831           OS << " " << I->getOpcodeName();
5832         OS.flush();
5833         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5834         Tail = Tail.drop_front(Subset.size());
5835         Subset = {};
5836       } else
5837         // Grow the subset by one element
5838         Subset = Tail.take_front(Subset.size() + 1);
5839     } while (!Tail.empty());
5840   }
5841 
5842   if (!EnableCondStoresVectorization && NumPredStores) {
5843     reportVectorizationFailure("There are conditional stores.",
5844         "store that is conditionally executed prevents vectorization",
5845         "ConditionalStore", ORE, TheLoop);
5846     ChosenFactor = ScalarCost;
5847   }
5848 
5849   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5850                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5851              << "LV: Vectorization seems to be not beneficial, "
5852              << "but was forced by a user.\n");
5853   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5854   return ChosenFactor;
5855 }
5856 
5857 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5858     const Loop &L, ElementCount VF) const {
5859   // Cross iteration phis such as reductions need special handling and are
5860   // currently unsupported.
5861   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5862         return Legal->isFirstOrderRecurrence(&Phi) ||
5863                Legal->isReductionVariable(&Phi);
5864       }))
5865     return false;
5866 
5867   // Phis with uses outside of the loop require special handling and are
5868   // currently unsupported.
5869   for (auto &Entry : Legal->getInductionVars()) {
5870     // Look for uses of the value of the induction at the last iteration.
5871     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5872     for (User *U : PostInc->users())
5873       if (!L.contains(cast<Instruction>(U)))
5874         return false;
5875     // Look for uses of penultimate value of the induction.
5876     for (User *U : Entry.first->users())
5877       if (!L.contains(cast<Instruction>(U)))
5878         return false;
5879   }
5880 
5881   // Induction variables that are widened require special handling that is
5882   // currently not supported.
5883   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5884         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5885                  this->isProfitableToScalarize(Entry.first, VF));
5886       }))
5887     return false;
5888 
5889   // Epilogue vectorization code has not been auditted to ensure it handles
5890   // non-latch exits properly.  It may be fine, but it needs auditted and
5891   // tested.
5892   if (L.getExitingBlock() != L.getLoopLatch())
5893     return false;
5894 
5895   return true;
5896 }
5897 
5898 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5899     const ElementCount VF) const {
5900   // FIXME: We need a much better cost-model to take different parameters such
5901   // as register pressure, code size increase and cost of extra branches into
5902   // account. For now we apply a very crude heuristic and only consider loops
5903   // with vectorization factors larger than a certain value.
5904   // We also consider epilogue vectorization unprofitable for targets that don't
5905   // consider interleaving beneficial (eg. MVE).
5906   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5907     return false;
5908   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5909     return true;
5910   return false;
5911 }
5912 
5913 VectorizationFactor
5914 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5915     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5916   VectorizationFactor Result = VectorizationFactor::Disabled();
5917   if (!EnableEpilogueVectorization) {
5918     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5919     return Result;
5920   }
5921 
5922   if (!isScalarEpilogueAllowed()) {
5923     LLVM_DEBUG(
5924         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5925                   "allowed.\n";);
5926     return Result;
5927   }
5928 
5929   // Not really a cost consideration, but check for unsupported cases here to
5930   // simplify the logic.
5931   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5932     LLVM_DEBUG(
5933         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5934                   "not a supported candidate.\n";);
5935     return Result;
5936   }
5937 
5938   if (EpilogueVectorizationForceVF > 1) {
5939     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5940     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5941     if (LVP.hasPlanWithVF(ForcedEC))
5942       return {ForcedEC, 0};
5943     else {
5944       LLVM_DEBUG(
5945           dbgs()
5946               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5947       return Result;
5948     }
5949   }
5950 
5951   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5952       TheLoop->getHeader()->getParent()->hasMinSize()) {
5953     LLVM_DEBUG(
5954         dbgs()
5955             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5956     return Result;
5957   }
5958 
5959   auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5960   if (MainLoopVF.isScalable())
5961     LLVM_DEBUG(
5962         dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
5963                   "yet supported. Converting to fixed-width (VF="
5964                << FixedMainLoopVF << ") instead\n");
5965 
5966   if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) {
5967     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5968                          "this loop\n");
5969     return Result;
5970   }
5971 
5972   for (auto &NextVF : ProfitableVFs)
5973     if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) &&
5974         (Result.Width.getFixedValue() == 1 ||
5975          isMoreProfitable(NextVF, Result)) &&
5976         LVP.hasPlanWithVF(NextVF.Width))
5977       Result = NextVF;
5978 
5979   if (Result != VectorizationFactor::Disabled())
5980     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5981                       << Result.Width.getFixedValue() << "\n";);
5982   return Result;
5983 }
5984 
5985 std::pair<unsigned, unsigned>
5986 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5987   unsigned MinWidth = -1U;
5988   unsigned MaxWidth = 8;
5989   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5990   for (Type *T : ElementTypesInLoop) {
5991     MinWidth = std::min<unsigned>(
5992         MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5993     MaxWidth = std::max<unsigned>(
5994         MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5995   }
5996   return {MinWidth, MaxWidth};
5997 }
5998 
5999 void LoopVectorizationCostModel::collectElementTypesForWidening() {
6000   ElementTypesInLoop.clear();
6001   // For each block.
6002   for (BasicBlock *BB : TheLoop->blocks()) {
6003     // For each instruction in the loop.
6004     for (Instruction &I : BB->instructionsWithoutDebug()) {
6005       Type *T = I.getType();
6006 
6007       // Skip ignored values.
6008       if (ValuesToIgnore.count(&I))
6009         continue;
6010 
6011       // Only examine Loads, Stores and PHINodes.
6012       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6013         continue;
6014 
6015       // Examine PHI nodes that are reduction variables. Update the type to
6016       // account for the recurrence type.
6017       if (auto *PN = dyn_cast<PHINode>(&I)) {
6018         if (!Legal->isReductionVariable(PN))
6019           continue;
6020         const RecurrenceDescriptor &RdxDesc =
6021             Legal->getReductionVars().find(PN)->second;
6022         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
6023             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
6024                                       RdxDesc.getRecurrenceType(),
6025                                       TargetTransformInfo::ReductionFlags()))
6026           continue;
6027         T = RdxDesc.getRecurrenceType();
6028       }
6029 
6030       // Examine the stored values.
6031       if (auto *ST = dyn_cast<StoreInst>(&I))
6032         T = ST->getValueOperand()->getType();
6033 
6034       // Ignore loaded pointer types and stored pointer types that are not
6035       // vectorizable.
6036       //
6037       // FIXME: The check here attempts to predict whether a load or store will
6038       //        be vectorized. We only know this for certain after a VF has
6039       //        been selected. Here, we assume that if an access can be
6040       //        vectorized, it will be. We should also look at extending this
6041       //        optimization to non-pointer types.
6042       //
6043       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6044           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
6045         continue;
6046 
6047       ElementTypesInLoop.insert(T);
6048     }
6049   }
6050 }
6051 
6052 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6053                                                            unsigned LoopCost) {
6054   // -- The interleave heuristics --
6055   // We interleave the loop in order to expose ILP and reduce the loop overhead.
6056   // There are many micro-architectural considerations that we can't predict
6057   // at this level. For example, frontend pressure (on decode or fetch) due to
6058   // code size, or the number and capabilities of the execution ports.
6059   //
6060   // We use the following heuristics to select the interleave count:
6061   // 1. If the code has reductions, then we interleave to break the cross
6062   // iteration dependency.
6063   // 2. If the loop is really small, then we interleave to reduce the loop
6064   // overhead.
6065   // 3. We don't interleave if we think that we will spill registers to memory
6066   // due to the increased register pressure.
6067 
6068   if (!isScalarEpilogueAllowed())
6069     return 1;
6070 
6071   // We used the distance for the interleave count.
6072   if (Legal->getMaxSafeDepDistBytes() != -1U)
6073     return 1;
6074 
6075   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6076   const bool HasReductions = !Legal->getReductionVars().empty();
6077   // Do not interleave loops with a relatively small known or estimated trip
6078   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
6079   // enabled, and the code has scalar reductions(HasReductions && VF = 1),
6080   // because with the above conditions interleaving can expose ILP and break
6081   // cross iteration dependences for reductions.
6082   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6083       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6084     return 1;
6085 
6086   RegisterUsage R = calculateRegisterUsage({VF})[0];
6087   // We divide by these constants so assume that we have at least one
6088   // instruction that uses at least one register.
6089   for (auto& pair : R.MaxLocalUsers) {
6090     pair.second = std::max(pair.second, 1U);
6091   }
6092 
6093   // We calculate the interleave count using the following formula.
6094   // Subtract the number of loop invariants from the number of available
6095   // registers. These registers are used by all of the interleaved instances.
6096   // Next, divide the remaining registers by the number of registers that is
6097   // required by the loop, in order to estimate how many parallel instances
6098   // fit without causing spills. All of this is rounded down if necessary to be
6099   // a power of two. We want power of two interleave count to simplify any
6100   // addressing operations or alignment considerations.
6101   // We also want power of two interleave counts to ensure that the induction
6102   // variable of the vector loop wraps to zero, when tail is folded by masking;
6103   // this currently happens when OptForSize, in which case IC is set to 1 above.
6104   unsigned IC = UINT_MAX;
6105 
6106   for (auto& pair : R.MaxLocalUsers) {
6107     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
6108     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6109                       << " registers of "
6110                       << TTI.getRegisterClassName(pair.first) << " register class\n");
6111     if (VF.isScalar()) {
6112       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6113         TargetNumRegisters = ForceTargetNumScalarRegs;
6114     } else {
6115       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6116         TargetNumRegisters = ForceTargetNumVectorRegs;
6117     }
6118     unsigned MaxLocalUsers = pair.second;
6119     unsigned LoopInvariantRegs = 0;
6120     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6121       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6122 
6123     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6124     // Don't count the induction variable as interleaved.
6125     if (EnableIndVarRegisterHeur) {
6126       TmpIC =
6127           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6128                         std::max(1U, (MaxLocalUsers - 1)));
6129     }
6130 
6131     IC = std::min(IC, TmpIC);
6132   }
6133 
6134   // Clamp the interleave ranges to reasonable counts.
6135   unsigned MaxInterleaveCount =
6136       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6137 
6138   // Check if the user has overridden the max.
6139   if (VF.isScalar()) {
6140     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6141       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6142   } else {
6143     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6144       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6145   }
6146 
6147   // If trip count is known or estimated compile time constant, limit the
6148   // interleave count to be less than the trip count divided by VF, provided it
6149   // is at least 1.
6150   //
6151   // For scalable vectors we can't know if interleaving is beneficial. It may
6152   // not be beneficial for small loops if none of the lanes in the second vector
6153   // iterations is enabled. However, for larger loops, there is likely to be a
6154   // similar benefit as for fixed-width vectors. For now, we choose to leave
6155   // the InterleaveCount as if vscale is '1', although if some information about
6156   // the vector is known (e.g. min vector size), we can make a better decision.
6157   if (BestKnownTC) {
6158     MaxInterleaveCount =
6159         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6160     // Make sure MaxInterleaveCount is greater than 0.
6161     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6162   }
6163 
6164   assert(MaxInterleaveCount > 0 &&
6165          "Maximum interleave count must be greater than 0");
6166 
6167   // Clamp the calculated IC to be between the 1 and the max interleave count
6168   // that the target and trip count allows.
6169   if (IC > MaxInterleaveCount)
6170     IC = MaxInterleaveCount;
6171   else
6172     // Make sure IC is greater than 0.
6173     IC = std::max(1u, IC);
6174 
6175   assert(IC > 0 && "Interleave count must be greater than 0.");
6176 
6177   // If we did not calculate the cost for VF (because the user selected the VF)
6178   // then we calculate the cost of VF here.
6179   if (LoopCost == 0) {
6180     InstructionCost C = expectedCost(VF).first;
6181     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
6182     LoopCost = *C.getValue();
6183   }
6184 
6185   assert(LoopCost && "Non-zero loop cost expected");
6186 
6187   // Interleave if we vectorized this loop and there is a reduction that could
6188   // benefit from interleaving.
6189   if (VF.isVector() && HasReductions) {
6190     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6191     return IC;
6192   }
6193 
6194   // Note that if we've already vectorized the loop we will have done the
6195   // runtime check and so interleaving won't require further checks.
6196   bool InterleavingRequiresRuntimePointerCheck =
6197       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6198 
6199   // We want to interleave small loops in order to reduce the loop overhead and
6200   // potentially expose ILP opportunities.
6201   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6202                     << "LV: IC is " << IC << '\n'
6203                     << "LV: VF is " << VF << '\n');
6204   const bool AggressivelyInterleaveReductions =
6205       TTI.enableAggressiveInterleaving(HasReductions);
6206   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6207     // We assume that the cost overhead is 1 and we use the cost model
6208     // to estimate the cost of the loop and interleave until the cost of the
6209     // loop overhead is about 5% of the cost of the loop.
6210     unsigned SmallIC =
6211         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6212 
6213     // Interleave until store/load ports (estimated by max interleave count) are
6214     // saturated.
6215     unsigned NumStores = Legal->getNumStores();
6216     unsigned NumLoads = Legal->getNumLoads();
6217     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6218     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6219 
6220     // There is little point in interleaving for reductions containing selects
6221     // and compares when VF=1 since it may just create more overhead than it's
6222     // worth for loops with small trip counts. This is because we still have to
6223     // do the final reduction after the loop.
6224     bool HasSelectCmpReductions =
6225         HasReductions &&
6226         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6227           const RecurrenceDescriptor &RdxDesc = Reduction.second;
6228           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
6229               RdxDesc.getRecurrenceKind());
6230         });
6231     if (HasSelectCmpReductions) {
6232       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
6233       return 1;
6234     }
6235 
6236     // If we have a scalar reduction (vector reductions are already dealt with
6237     // by this point), we can increase the critical path length if the loop
6238     // we're interleaving is inside another loop. For tree-wise reductions
6239     // set the limit to 2, and for ordered reductions it's best to disable
6240     // interleaving entirely.
6241     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6242       bool HasOrderedReductions =
6243           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6244             const RecurrenceDescriptor &RdxDesc = Reduction.second;
6245             return RdxDesc.isOrdered();
6246           });
6247       if (HasOrderedReductions) {
6248         LLVM_DEBUG(
6249             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
6250         return 1;
6251       }
6252 
6253       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6254       SmallIC = std::min(SmallIC, F);
6255       StoresIC = std::min(StoresIC, F);
6256       LoadsIC = std::min(LoadsIC, F);
6257     }
6258 
6259     if (EnableLoadStoreRuntimeInterleave &&
6260         std::max(StoresIC, LoadsIC) > SmallIC) {
6261       LLVM_DEBUG(
6262           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6263       return std::max(StoresIC, LoadsIC);
6264     }
6265 
6266     // If there are scalar reductions and TTI has enabled aggressive
6267     // interleaving for reductions, we will interleave to expose ILP.
6268     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6269         AggressivelyInterleaveReductions) {
6270       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6271       // Interleave no less than SmallIC but not as aggressive as the normal IC
6272       // to satisfy the rare situation when resources are too limited.
6273       return std::max(IC / 2, SmallIC);
6274     } else {
6275       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6276       return SmallIC;
6277     }
6278   }
6279 
6280   // Interleave if this is a large loop (small loops are already dealt with by
6281   // this point) that could benefit from interleaving.
6282   if (AggressivelyInterleaveReductions) {
6283     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6284     return IC;
6285   }
6286 
6287   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6288   return 1;
6289 }
6290 
6291 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6292 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6293   // This function calculates the register usage by measuring the highest number
6294   // of values that are alive at a single location. Obviously, this is a very
6295   // rough estimation. We scan the loop in a topological order in order and
6296   // assign a number to each instruction. We use RPO to ensure that defs are
6297   // met before their users. We assume that each instruction that has in-loop
6298   // users starts an interval. We record every time that an in-loop value is
6299   // used, so we have a list of the first and last occurrences of each
6300   // instruction. Next, we transpose this data structure into a multi map that
6301   // holds the list of intervals that *end* at a specific location. This multi
6302   // map allows us to perform a linear search. We scan the instructions linearly
6303   // and record each time that a new interval starts, by placing it in a set.
6304   // If we find this value in the multi-map then we remove it from the set.
6305   // The max register usage is the maximum size of the set.
6306   // We also search for instructions that are defined outside the loop, but are
6307   // used inside the loop. We need this number separately from the max-interval
6308   // usage number because when we unroll, loop-invariant values do not take
6309   // more register.
6310   LoopBlocksDFS DFS(TheLoop);
6311   DFS.perform(LI);
6312 
6313   RegisterUsage RU;
6314 
6315   // Each 'key' in the map opens a new interval. The values
6316   // of the map are the index of the 'last seen' usage of the
6317   // instruction that is the key.
6318   using IntervalMap = DenseMap<Instruction *, unsigned>;
6319 
6320   // Maps instruction to its index.
6321   SmallVector<Instruction *, 64> IdxToInstr;
6322   // Marks the end of each interval.
6323   IntervalMap EndPoint;
6324   // Saves the list of instruction indices that are used in the loop.
6325   SmallPtrSet<Instruction *, 8> Ends;
6326   // Saves the list of values that are used in the loop but are
6327   // defined outside the loop, such as arguments and constants.
6328   SmallPtrSet<Value *, 8> LoopInvariants;
6329 
6330   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6331     for (Instruction &I : BB->instructionsWithoutDebug()) {
6332       IdxToInstr.push_back(&I);
6333 
6334       // Save the end location of each USE.
6335       for (Value *U : I.operands()) {
6336         auto *Instr = dyn_cast<Instruction>(U);
6337 
6338         // Ignore non-instruction values such as arguments, constants, etc.
6339         if (!Instr)
6340           continue;
6341 
6342         // If this instruction is outside the loop then record it and continue.
6343         if (!TheLoop->contains(Instr)) {
6344           LoopInvariants.insert(Instr);
6345           continue;
6346         }
6347 
6348         // Overwrite previous end points.
6349         EndPoint[Instr] = IdxToInstr.size();
6350         Ends.insert(Instr);
6351       }
6352     }
6353   }
6354 
6355   // Saves the list of intervals that end with the index in 'key'.
6356   using InstrList = SmallVector<Instruction *, 2>;
6357   DenseMap<unsigned, InstrList> TransposeEnds;
6358 
6359   // Transpose the EndPoints to a list of values that end at each index.
6360   for (auto &Interval : EndPoint)
6361     TransposeEnds[Interval.second].push_back(Interval.first);
6362 
6363   SmallPtrSet<Instruction *, 8> OpenIntervals;
6364   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6365   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6366 
6367   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6368 
6369   // A lambda that gets the register usage for the given type and VF.
6370   const auto &TTICapture = TTI;
6371   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6372     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6373       return 0;
6374     InstructionCost::CostType RegUsage =
6375         *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6376     assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
6377            "Nonsensical values for register usage.");
6378     return RegUsage;
6379   };
6380 
6381   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6382     Instruction *I = IdxToInstr[i];
6383 
6384     // Remove all of the instructions that end at this location.
6385     InstrList &List = TransposeEnds[i];
6386     for (Instruction *ToRemove : List)
6387       OpenIntervals.erase(ToRemove);
6388 
6389     // Ignore instructions that are never used within the loop.
6390     if (!Ends.count(I))
6391       continue;
6392 
6393     // Skip ignored values.
6394     if (ValuesToIgnore.count(I))
6395       continue;
6396 
6397     // For each VF find the maximum usage of registers.
6398     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6399       // Count the number of live intervals.
6400       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6401 
6402       if (VFs[j].isScalar()) {
6403         for (auto Inst : OpenIntervals) {
6404           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6405           if (RegUsage.find(ClassID) == RegUsage.end())
6406             RegUsage[ClassID] = 1;
6407           else
6408             RegUsage[ClassID] += 1;
6409         }
6410       } else {
6411         collectUniformsAndScalars(VFs[j]);
6412         for (auto Inst : OpenIntervals) {
6413           // Skip ignored values for VF > 1.
6414           if (VecValuesToIgnore.count(Inst))
6415             continue;
6416           if (isScalarAfterVectorization(Inst, VFs[j])) {
6417             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6418             if (RegUsage.find(ClassID) == RegUsage.end())
6419               RegUsage[ClassID] = 1;
6420             else
6421               RegUsage[ClassID] += 1;
6422           } else {
6423             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
6424             if (RegUsage.find(ClassID) == RegUsage.end())
6425               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6426             else
6427               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6428           }
6429         }
6430       }
6431 
6432       for (auto& pair : RegUsage) {
6433         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
6434           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
6435         else
6436           MaxUsages[j][pair.first] = pair.second;
6437       }
6438     }
6439 
6440     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6441                       << OpenIntervals.size() << '\n');
6442 
6443     // Add the current instruction to the list of open intervals.
6444     OpenIntervals.insert(I);
6445   }
6446 
6447   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6448     SmallMapVector<unsigned, unsigned, 4> Invariant;
6449 
6450     for (auto Inst : LoopInvariants) {
6451       unsigned Usage =
6452           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6453       unsigned ClassID =
6454           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6455       if (Invariant.find(ClassID) == Invariant.end())
6456         Invariant[ClassID] = Usage;
6457       else
6458         Invariant[ClassID] += Usage;
6459     }
6460 
6461     LLVM_DEBUG({
6462       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6463       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6464              << " item\n";
6465       for (const auto &pair : MaxUsages[i]) {
6466         dbgs() << "LV(REG): RegisterClass: "
6467                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6468                << " registers\n";
6469       }
6470       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6471              << " item\n";
6472       for (const auto &pair : Invariant) {
6473         dbgs() << "LV(REG): RegisterClass: "
6474                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6475                << " registers\n";
6476       }
6477     });
6478 
6479     RU.LoopInvariantRegs = Invariant;
6480     RU.MaxLocalUsers = MaxUsages[i];
6481     RUs[i] = RU;
6482   }
6483 
6484   return RUs;
6485 }
6486 
6487 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
6488   // TODO: Cost model for emulated masked load/store is completely
6489   // broken. This hack guides the cost model to use an artificially
6490   // high enough value to practically disable vectorization with such
6491   // operations, except where previously deployed legality hack allowed
6492   // using very low cost values. This is to avoid regressions coming simply
6493   // from moving "masked load/store" check from legality to cost model.
6494   // Masked Load/Gather emulation was previously never allowed.
6495   // Limited number of Masked Store/Scatter emulation was allowed.
6496   assert(isPredicatedInst(I) &&
6497          "Expecting a scalar emulated instruction");
6498   return isa<LoadInst>(I) ||
6499          (isa<StoreInst>(I) &&
6500           NumPredStores > NumberOfStoresToPredicate);
6501 }
6502 
6503 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6504   // If we aren't vectorizing the loop, or if we've already collected the
6505   // instructions to scalarize, there's nothing to do. Collection may already
6506   // have occurred if we have a user-selected VF and are now computing the
6507   // expected cost for interleaving.
6508   if (VF.isScalar() || VF.isZero() ||
6509       InstsToScalarize.find(VF) != InstsToScalarize.end())
6510     return;
6511 
6512   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
6513   // not profitable to scalarize any instructions, the presence of VF in the
6514   // map will indicate that we've analyzed it already.
6515   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6516 
6517   // Find all the instructions that are scalar with predication in the loop and
6518   // determine if it would be better to not if-convert the blocks they are in.
6519   // If so, we also record the instructions to scalarize.
6520   for (BasicBlock *BB : TheLoop->blocks()) {
6521     if (!blockNeedsPredicationForAnyReason(BB))
6522       continue;
6523     for (Instruction &I : *BB)
6524       if (isScalarWithPredication(&I)) {
6525         ScalarCostsTy ScalarCosts;
6526         // Do not apply discount if scalable, because that would lead to
6527         // invalid scalarization costs.
6528         // Do not apply discount logic if hacked cost is needed
6529         // for emulated masked memrefs.
6530         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
6531             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6532           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6533         // Remember that BB will remain after vectorization.
6534         PredicatedBBsAfterVectorization.insert(BB);
6535       }
6536   }
6537 }
6538 
6539 int LoopVectorizationCostModel::computePredInstDiscount(
6540     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6541   assert(!isUniformAfterVectorization(PredInst, VF) &&
6542          "Instruction marked uniform-after-vectorization will be predicated");
6543 
6544   // Initialize the discount to zero, meaning that the scalar version and the
6545   // vector version cost the same.
6546   InstructionCost Discount = 0;
6547 
6548   // Holds instructions to analyze. The instructions we visit are mapped in
6549   // ScalarCosts. Those instructions are the ones that would be scalarized if
6550   // we find that the scalar version costs less.
6551   SmallVector<Instruction *, 8> Worklist;
6552 
6553   // Returns true if the given instruction can be scalarized.
6554   auto canBeScalarized = [&](Instruction *I) -> bool {
6555     // We only attempt to scalarize instructions forming a single-use chain
6556     // from the original predicated block that would otherwise be vectorized.
6557     // Although not strictly necessary, we give up on instructions we know will
6558     // already be scalar to avoid traversing chains that are unlikely to be
6559     // beneficial.
6560     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6561         isScalarAfterVectorization(I, VF))
6562       return false;
6563 
6564     // If the instruction is scalar with predication, it will be analyzed
6565     // separately. We ignore it within the context of PredInst.
6566     if (isScalarWithPredication(I))
6567       return false;
6568 
6569     // If any of the instruction's operands are uniform after vectorization,
6570     // the instruction cannot be scalarized. This prevents, for example, a
6571     // masked load from being scalarized.
6572     //
6573     // We assume we will only emit a value for lane zero of an instruction
6574     // marked uniform after vectorization, rather than VF identical values.
6575     // Thus, if we scalarize an instruction that uses a uniform, we would
6576     // create uses of values corresponding to the lanes we aren't emitting code
6577     // for. This behavior can be changed by allowing getScalarValue to clone
6578     // the lane zero values for uniforms rather than asserting.
6579     for (Use &U : I->operands())
6580       if (auto *J = dyn_cast<Instruction>(U.get()))
6581         if (isUniformAfterVectorization(J, VF))
6582           return false;
6583 
6584     // Otherwise, we can scalarize the instruction.
6585     return true;
6586   };
6587 
6588   // Compute the expected cost discount from scalarizing the entire expression
6589   // feeding the predicated instruction. We currently only consider expressions
6590   // that are single-use instruction chains.
6591   Worklist.push_back(PredInst);
6592   while (!Worklist.empty()) {
6593     Instruction *I = Worklist.pop_back_val();
6594 
6595     // If we've already analyzed the instruction, there's nothing to do.
6596     if (ScalarCosts.find(I) != ScalarCosts.end())
6597       continue;
6598 
6599     // Compute the cost of the vector instruction. Note that this cost already
6600     // includes the scalarization overhead of the predicated instruction.
6601     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6602 
6603     // Compute the cost of the scalarized instruction. This cost is the cost of
6604     // the instruction as if it wasn't if-converted and instead remained in the
6605     // predicated block. We will scale this cost by block probability after
6606     // computing the scalarization overhead.
6607     InstructionCost ScalarCost =
6608         VF.getFixedValue() *
6609         getInstructionCost(I, ElementCount::getFixed(1)).first;
6610 
6611     // Compute the scalarization overhead of needed insertelement instructions
6612     // and phi nodes.
6613     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6614       ScalarCost += TTI.getScalarizationOverhead(
6615           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6616           APInt::getAllOnes(VF.getFixedValue()), true, false);
6617       ScalarCost +=
6618           VF.getFixedValue() *
6619           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6620     }
6621 
6622     // Compute the scalarization overhead of needed extractelement
6623     // instructions. For each of the instruction's operands, if the operand can
6624     // be scalarized, add it to the worklist; otherwise, account for the
6625     // overhead.
6626     for (Use &U : I->operands())
6627       if (auto *J = dyn_cast<Instruction>(U.get())) {
6628         assert(VectorType::isValidElementType(J->getType()) &&
6629                "Instruction has non-scalar type");
6630         if (canBeScalarized(J))
6631           Worklist.push_back(J);
6632         else if (needsExtract(J, VF)) {
6633           ScalarCost += TTI.getScalarizationOverhead(
6634               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6635               APInt::getAllOnes(VF.getFixedValue()), false, true);
6636         }
6637       }
6638 
6639     // Scale the total scalar cost by block probability.
6640     ScalarCost /= getReciprocalPredBlockProb();
6641 
6642     // Compute the discount. A non-negative discount means the vector version
6643     // of the instruction costs more, and scalarizing would be beneficial.
6644     Discount += VectorCost - ScalarCost;
6645     ScalarCosts[I] = ScalarCost;
6646   }
6647 
6648   return *Discount.getValue();
6649 }
6650 
6651 LoopVectorizationCostModel::VectorizationCostTy
6652 LoopVectorizationCostModel::expectedCost(
6653     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6654   VectorizationCostTy Cost;
6655 
6656   // For each block.
6657   for (BasicBlock *BB : TheLoop->blocks()) {
6658     VectorizationCostTy BlockCost;
6659 
6660     // For each instruction in the old loop.
6661     for (Instruction &I : BB->instructionsWithoutDebug()) {
6662       // Skip ignored values.
6663       if (ValuesToIgnore.count(&I) ||
6664           (VF.isVector() && VecValuesToIgnore.count(&I)))
6665         continue;
6666 
6667       VectorizationCostTy C = getInstructionCost(&I, VF);
6668 
6669       // Check if we should override the cost.
6670       if (C.first.isValid() &&
6671           ForceTargetInstructionCost.getNumOccurrences() > 0)
6672         C.first = InstructionCost(ForceTargetInstructionCost);
6673 
6674       // Keep a list of instructions with invalid costs.
6675       if (Invalid && !C.first.isValid())
6676         Invalid->emplace_back(&I, VF);
6677 
6678       BlockCost.first += C.first;
6679       BlockCost.second |= C.second;
6680       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6681                         << " for VF " << VF << " For instruction: " << I
6682                         << '\n');
6683     }
6684 
6685     // If we are vectorizing a predicated block, it will have been
6686     // if-converted. This means that the block's instructions (aside from
6687     // stores and instructions that may divide by zero) will now be
6688     // unconditionally executed. For the scalar case, we may not always execute
6689     // the predicated block, if it is an if-else block. Thus, scale the block's
6690     // cost by the probability of executing it. blockNeedsPredication from
6691     // Legal is used so as to not include all blocks in tail folded loops.
6692     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6693       BlockCost.first /= getReciprocalPredBlockProb();
6694 
6695     Cost.first += BlockCost.first;
6696     Cost.second |= BlockCost.second;
6697   }
6698 
6699   return Cost;
6700 }
6701 
6702 /// Gets Address Access SCEV after verifying that the access pattern
6703 /// is loop invariant except the induction variable dependence.
6704 ///
6705 /// This SCEV can be sent to the Target in order to estimate the address
6706 /// calculation cost.
6707 static const SCEV *getAddressAccessSCEV(
6708               Value *Ptr,
6709               LoopVectorizationLegality *Legal,
6710               PredicatedScalarEvolution &PSE,
6711               const Loop *TheLoop) {
6712 
6713   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6714   if (!Gep)
6715     return nullptr;
6716 
6717   // We are looking for a gep with all loop invariant indices except for one
6718   // which should be an induction variable.
6719   auto SE = PSE.getSE();
6720   unsigned NumOperands = Gep->getNumOperands();
6721   for (unsigned i = 1; i < NumOperands; ++i) {
6722     Value *Opd = Gep->getOperand(i);
6723     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6724         !Legal->isInductionVariable(Opd))
6725       return nullptr;
6726   }
6727 
6728   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
6729   return PSE.getSCEV(Ptr);
6730 }
6731 
6732 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6733   return Legal->hasStride(I->getOperand(0)) ||
6734          Legal->hasStride(I->getOperand(1));
6735 }
6736 
6737 InstructionCost
6738 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6739                                                         ElementCount VF) {
6740   assert(VF.isVector() &&
6741          "Scalarization cost of instruction implies vectorization.");
6742   if (VF.isScalable())
6743     return InstructionCost::getInvalid();
6744 
6745   Type *ValTy = getLoadStoreType(I);
6746   auto SE = PSE.getSE();
6747 
6748   unsigned AS = getLoadStoreAddressSpace(I);
6749   Value *Ptr = getLoadStorePointerOperand(I);
6750   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6751   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6752   //       that it is being called from this specific place.
6753 
6754   // Figure out whether the access is strided and get the stride value
6755   // if it's known in compile time
6756   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6757 
6758   // Get the cost of the scalar memory instruction and address computation.
6759   InstructionCost Cost =
6760       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6761 
6762   // Don't pass *I here, since it is scalar but will actually be part of a
6763   // vectorized loop where the user of it is a vectorized instruction.
6764   const Align Alignment = getLoadStoreAlignment(I);
6765   Cost += VF.getKnownMinValue() *
6766           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6767                               AS, TTI::TCK_RecipThroughput);
6768 
6769   // Get the overhead of the extractelement and insertelement instructions
6770   // we might create due to scalarization.
6771   Cost += getScalarizationOverhead(I, VF);
6772 
6773   // If we have a predicated load/store, it will need extra i1 extracts and
6774   // conditional branches, but may not be executed for each vector lane. Scale
6775   // the cost by the probability of executing the predicated block.
6776   if (isPredicatedInst(I)) {
6777     Cost /= getReciprocalPredBlockProb();
6778 
6779     // Add the cost of an i1 extract and a branch
6780     auto *Vec_i1Ty =
6781         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6782     Cost += TTI.getScalarizationOverhead(
6783         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6784         /*Insert=*/false, /*Extract=*/true);
6785     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6786 
6787     if (useEmulatedMaskMemRefHack(I))
6788       // Artificially setting to a high enough value to practically disable
6789       // vectorization with such operations.
6790       Cost = 3000000;
6791   }
6792 
6793   return Cost;
6794 }
6795 
6796 InstructionCost
6797 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6798                                                     ElementCount VF) {
6799   Type *ValTy = getLoadStoreType(I);
6800   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6801   Value *Ptr = getLoadStorePointerOperand(I);
6802   unsigned AS = getLoadStoreAddressSpace(I);
6803   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6804   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6805 
6806   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6807          "Stride should be 1 or -1 for consecutive memory access");
6808   const Align Alignment = getLoadStoreAlignment(I);
6809   InstructionCost Cost = 0;
6810   if (Legal->isMaskRequired(I))
6811     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6812                                       CostKind);
6813   else
6814     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6815                                 CostKind, I);
6816 
6817   bool Reverse = ConsecutiveStride < 0;
6818   if (Reverse)
6819     Cost +=
6820         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6821   return Cost;
6822 }
6823 
6824 InstructionCost
6825 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6826                                                 ElementCount VF) {
6827   assert(Legal->isUniformMemOp(*I));
6828 
6829   Type *ValTy = getLoadStoreType(I);
6830   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6831   const Align Alignment = getLoadStoreAlignment(I);
6832   unsigned AS = getLoadStoreAddressSpace(I);
6833   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6834   if (isa<LoadInst>(I)) {
6835     return TTI.getAddressComputationCost(ValTy) +
6836            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6837                                CostKind) +
6838            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6839   }
6840   StoreInst *SI = cast<StoreInst>(I);
6841 
6842   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6843   return TTI.getAddressComputationCost(ValTy) +
6844          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6845                              CostKind) +
6846          (isLoopInvariantStoreValue
6847               ? 0
6848               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6849                                        VF.getKnownMinValue() - 1));
6850 }
6851 
6852 InstructionCost
6853 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6854                                                  ElementCount VF) {
6855   Type *ValTy = getLoadStoreType(I);
6856   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6857   const Align Alignment = getLoadStoreAlignment(I);
6858   const Value *Ptr = getLoadStorePointerOperand(I);
6859 
6860   return TTI.getAddressComputationCost(VectorTy) +
6861          TTI.getGatherScatterOpCost(
6862              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6863              TargetTransformInfo::TCK_RecipThroughput, I);
6864 }
6865 
6866 InstructionCost
6867 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6868                                                    ElementCount VF) {
6869   // TODO: Once we have support for interleaving with scalable vectors
6870   // we can calculate the cost properly here.
6871   if (VF.isScalable())
6872     return InstructionCost::getInvalid();
6873 
6874   Type *ValTy = getLoadStoreType(I);
6875   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6876   unsigned AS = getLoadStoreAddressSpace(I);
6877 
6878   auto Group = getInterleavedAccessGroup(I);
6879   assert(Group && "Fail to get an interleaved access group.");
6880 
6881   unsigned InterleaveFactor = Group->getFactor();
6882   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6883 
6884   // Holds the indices of existing members in the interleaved group.
6885   SmallVector<unsigned, 4> Indices;
6886   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6887     if (Group->getMember(IF))
6888       Indices.push_back(IF);
6889 
6890   // Calculate the cost of the whole interleaved group.
6891   bool UseMaskForGaps =
6892       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6893       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6894   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6895       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6896       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6897 
6898   if (Group->isReverse()) {
6899     // TODO: Add support for reversed masked interleaved access.
6900     assert(!Legal->isMaskRequired(I) &&
6901            "Reverse masked interleaved access not supported.");
6902     Cost +=
6903         Group->getNumMembers() *
6904         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6905   }
6906   return Cost;
6907 }
6908 
6909 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6910     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6911   using namespace llvm::PatternMatch;
6912   // Early exit for no inloop reductions
6913   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6914     return None;
6915   auto *VectorTy = cast<VectorType>(Ty);
6916 
6917   // We are looking for a pattern of, and finding the minimal acceptable cost:
6918   //  reduce(mul(ext(A), ext(B))) or
6919   //  reduce(mul(A, B)) or
6920   //  reduce(ext(A)) or
6921   //  reduce(A).
6922   // The basic idea is that we walk down the tree to do that, finding the root
6923   // reduction instruction in InLoopReductionImmediateChains. From there we find
6924   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6925   // of the components. If the reduction cost is lower then we return it for the
6926   // reduction instruction and 0 for the other instructions in the pattern. If
6927   // it is not we return an invalid cost specifying the orignal cost method
6928   // should be used.
6929   Instruction *RetI = I;
6930   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6931     if (!RetI->hasOneUser())
6932       return None;
6933     RetI = RetI->user_back();
6934   }
6935   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6936       RetI->user_back()->getOpcode() == Instruction::Add) {
6937     if (!RetI->hasOneUser())
6938       return None;
6939     RetI = RetI->user_back();
6940   }
6941 
6942   // Test if the found instruction is a reduction, and if not return an invalid
6943   // cost specifying the parent to use the original cost modelling.
6944   if (!InLoopReductionImmediateChains.count(RetI))
6945     return None;
6946 
6947   // Find the reduction this chain is a part of and calculate the basic cost of
6948   // the reduction on its own.
6949   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6950   Instruction *ReductionPhi = LastChain;
6951   while (!isa<PHINode>(ReductionPhi))
6952     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6953 
6954   const RecurrenceDescriptor &RdxDesc =
6955       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6956 
6957   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6958       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6959 
6960   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6961   // normal fmul instruction to the cost of the fadd reduction.
6962   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6963     BaseCost +=
6964         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6965 
6966   // If we're using ordered reductions then we can just return the base cost
6967   // here, since getArithmeticReductionCost calculates the full ordered
6968   // reduction cost when FP reassociation is not allowed.
6969   if (useOrderedReductions(RdxDesc))
6970     return BaseCost;
6971 
6972   // Get the operand that was not the reduction chain and match it to one of the
6973   // patterns, returning the better cost if it is found.
6974   Instruction *RedOp = RetI->getOperand(1) == LastChain
6975                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6976                            : dyn_cast<Instruction>(RetI->getOperand(1));
6977 
6978   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6979 
6980   Instruction *Op0, *Op1;
6981   if (RedOp &&
6982       match(RedOp,
6983             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6984       match(Op0, m_ZExtOrSExt(m_Value())) &&
6985       Op0->getOpcode() == Op1->getOpcode() &&
6986       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6987       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6988       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6989 
6990     // Matched reduce(ext(mul(ext(A), ext(B)))
6991     // Note that the extend opcodes need to all match, or if A==B they will have
6992     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6993     // which is equally fine.
6994     bool IsUnsigned = isa<ZExtInst>(Op0);
6995     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6996     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6997 
6998     InstructionCost ExtCost =
6999         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
7000                              TTI::CastContextHint::None, CostKind, Op0);
7001     InstructionCost MulCost =
7002         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
7003     InstructionCost Ext2Cost =
7004         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
7005                              TTI::CastContextHint::None, CostKind, RedOp);
7006 
7007     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7008         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7009         CostKind);
7010 
7011     if (RedCost.isValid() &&
7012         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
7013       return I == RetI ? RedCost : 0;
7014   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
7015              !TheLoop->isLoopInvariant(RedOp)) {
7016     // Matched reduce(ext(A))
7017     bool IsUnsigned = isa<ZExtInst>(RedOp);
7018     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
7019     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7020         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7021         CostKind);
7022 
7023     InstructionCost ExtCost =
7024         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
7025                              TTI::CastContextHint::None, CostKind, RedOp);
7026     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
7027       return I == RetI ? RedCost : 0;
7028   } else if (RedOp &&
7029              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
7030     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
7031         Op0->getOpcode() == Op1->getOpcode() &&
7032         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
7033       bool IsUnsigned = isa<ZExtInst>(Op0);
7034       Type *Op0Ty = Op0->getOperand(0)->getType();
7035       Type *Op1Ty = Op1->getOperand(0)->getType();
7036       Type *LargestOpTy =
7037           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
7038                                                                     : Op0Ty;
7039       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
7040 
7041       // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
7042       // different sizes. We take the largest type as the ext to reduce, and add
7043       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
7044       InstructionCost ExtCost0 = TTI.getCastInstrCost(
7045           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
7046           TTI::CastContextHint::None, CostKind, Op0);
7047       InstructionCost ExtCost1 = TTI.getCastInstrCost(
7048           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
7049           TTI::CastContextHint::None, CostKind, Op1);
7050       InstructionCost MulCost =
7051           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7052 
7053       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7054           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7055           CostKind);
7056       InstructionCost ExtraExtCost = 0;
7057       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
7058         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
7059         ExtraExtCost = TTI.getCastInstrCost(
7060             ExtraExtOp->getOpcode(), ExtType,
7061             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
7062             TTI::CastContextHint::None, CostKind, ExtraExtOp);
7063       }
7064 
7065       if (RedCost.isValid() &&
7066           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
7067         return I == RetI ? RedCost : 0;
7068     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
7069       // Matched reduce(mul())
7070       InstructionCost MulCost =
7071           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7072 
7073       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7074           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
7075           CostKind);
7076 
7077       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
7078         return I == RetI ? RedCost : 0;
7079     }
7080   }
7081 
7082   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
7083 }
7084 
7085 InstructionCost
7086 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7087                                                      ElementCount VF) {
7088   // Calculate scalar cost only. Vectorization cost should be ready at this
7089   // moment.
7090   if (VF.isScalar()) {
7091     Type *ValTy = getLoadStoreType(I);
7092     const Align Alignment = getLoadStoreAlignment(I);
7093     unsigned AS = getLoadStoreAddressSpace(I);
7094 
7095     return TTI.getAddressComputationCost(ValTy) +
7096            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7097                                TTI::TCK_RecipThroughput, I);
7098   }
7099   return getWideningCost(I, VF);
7100 }
7101 
7102 LoopVectorizationCostModel::VectorizationCostTy
7103 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7104                                                ElementCount VF) {
7105   // If we know that this instruction will remain uniform, check the cost of
7106   // the scalar version.
7107   if (isUniformAfterVectorization(I, VF))
7108     VF = ElementCount::getFixed(1);
7109 
7110   if (VF.isVector() && isProfitableToScalarize(I, VF))
7111     return VectorizationCostTy(InstsToScalarize[VF][I], false);
7112 
7113   // Forced scalars do not have any scalarization overhead.
7114   auto ForcedScalar = ForcedScalars.find(VF);
7115   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
7116     auto InstSet = ForcedScalar->second;
7117     if (InstSet.count(I))
7118       return VectorizationCostTy(
7119           (getInstructionCost(I, ElementCount::getFixed(1)).first *
7120            VF.getKnownMinValue()),
7121           false);
7122   }
7123 
7124   Type *VectorTy;
7125   InstructionCost C = getInstructionCost(I, VF, VectorTy);
7126 
7127   bool TypeNotScalarized = false;
7128   if (VF.isVector() && VectorTy->isVectorTy()) {
7129     unsigned NumParts = TTI.getNumberOfParts(VectorTy);
7130     if (NumParts)
7131       TypeNotScalarized = NumParts < VF.getKnownMinValue();
7132     else
7133       C = InstructionCost::getInvalid();
7134   }
7135   return VectorizationCostTy(C, TypeNotScalarized);
7136 }
7137 
7138 InstructionCost
7139 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
7140                                                      ElementCount VF) const {
7141 
7142   // There is no mechanism yet to create a scalable scalarization loop,
7143   // so this is currently Invalid.
7144   if (VF.isScalable())
7145     return InstructionCost::getInvalid();
7146 
7147   if (VF.isScalar())
7148     return 0;
7149 
7150   InstructionCost Cost = 0;
7151   Type *RetTy = ToVectorTy(I->getType(), VF);
7152   if (!RetTy->isVoidTy() &&
7153       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
7154     Cost += TTI.getScalarizationOverhead(
7155         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
7156         false);
7157 
7158   // Some targets keep addresses scalar.
7159   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
7160     return Cost;
7161 
7162   // Some targets support efficient element stores.
7163   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
7164     return Cost;
7165 
7166   // Collect operands to consider.
7167   CallInst *CI = dyn_cast<CallInst>(I);
7168   Instruction::op_range Ops = CI ? CI->args() : I->operands();
7169 
7170   // Skip operands that do not require extraction/scalarization and do not incur
7171   // any overhead.
7172   SmallVector<Type *> Tys;
7173   for (auto *V : filterExtractingOperands(Ops, VF))
7174     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7175   return Cost + TTI.getOperandsScalarizationOverhead(
7176                     filterExtractingOperands(Ops, VF), Tys);
7177 }
7178 
7179 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
7180   if (VF.isScalar())
7181     return;
7182   NumPredStores = 0;
7183   for (BasicBlock *BB : TheLoop->blocks()) {
7184     // For each instruction in the old loop.
7185     for (Instruction &I : *BB) {
7186       Value *Ptr =  getLoadStorePointerOperand(&I);
7187       if (!Ptr)
7188         continue;
7189 
7190       // TODO: We should generate better code and update the cost model for
7191       // predicated uniform stores. Today they are treated as any other
7192       // predicated store (see added test cases in
7193       // invariant-store-vectorization.ll).
7194       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
7195         NumPredStores++;
7196 
7197       if (Legal->isUniformMemOp(I)) {
7198         // TODO: Avoid replicating loads and stores instead of
7199         // relying on instcombine to remove them.
7200         // Load: Scalar load + broadcast
7201         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
7202         InstructionCost Cost;
7203         if (isa<StoreInst>(&I) && VF.isScalable() &&
7204             isLegalGatherOrScatter(&I)) {
7205           Cost = getGatherScatterCost(&I, VF);
7206           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
7207         } else {
7208           assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
7209                  "Cannot yet scalarize uniform stores");
7210           Cost = getUniformMemOpCost(&I, VF);
7211           setWideningDecision(&I, VF, CM_Scalarize, Cost);
7212         }
7213         continue;
7214       }
7215 
7216       // We assume that widening is the best solution when possible.
7217       if (memoryInstructionCanBeWidened(&I, VF)) {
7218         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7219         int ConsecutiveStride = Legal->isConsecutivePtr(
7220             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
7221         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7222                "Expected consecutive stride.");
7223         InstWidening Decision =
7224             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7225         setWideningDecision(&I, VF, Decision, Cost);
7226         continue;
7227       }
7228 
7229       // Choose between Interleaving, Gather/Scatter or Scalarization.
7230       InstructionCost InterleaveCost = InstructionCost::getInvalid();
7231       unsigned NumAccesses = 1;
7232       if (isAccessInterleaved(&I)) {
7233         auto Group = getInterleavedAccessGroup(&I);
7234         assert(Group && "Fail to get an interleaved access group.");
7235 
7236         // Make one decision for the whole group.
7237         if (getWideningDecision(&I, VF) != CM_Unknown)
7238           continue;
7239 
7240         NumAccesses = Group->getNumMembers();
7241         if (interleavedAccessCanBeWidened(&I, VF))
7242           InterleaveCost = getInterleaveGroupCost(&I, VF);
7243       }
7244 
7245       InstructionCost GatherScatterCost =
7246           isLegalGatherOrScatter(&I)
7247               ? getGatherScatterCost(&I, VF) * NumAccesses
7248               : InstructionCost::getInvalid();
7249 
7250       InstructionCost ScalarizationCost =
7251           getMemInstScalarizationCost(&I, VF) * NumAccesses;
7252 
7253       // Choose better solution for the current VF,
7254       // write down this decision and use it during vectorization.
7255       InstructionCost Cost;
7256       InstWidening Decision;
7257       if (InterleaveCost <= GatherScatterCost &&
7258           InterleaveCost < ScalarizationCost) {
7259         Decision = CM_Interleave;
7260         Cost = InterleaveCost;
7261       } else if (GatherScatterCost < ScalarizationCost) {
7262         Decision = CM_GatherScatter;
7263         Cost = GatherScatterCost;
7264       } else {
7265         Decision = CM_Scalarize;
7266         Cost = ScalarizationCost;
7267       }
7268       // If the instructions belongs to an interleave group, the whole group
7269       // receives the same decision. The whole group receives the cost, but
7270       // the cost will actually be assigned to one instruction.
7271       if (auto Group = getInterleavedAccessGroup(&I))
7272         setWideningDecision(Group, VF, Decision, Cost);
7273       else
7274         setWideningDecision(&I, VF, Decision, Cost);
7275     }
7276   }
7277 
7278   // Make sure that any load of address and any other address computation
7279   // remains scalar unless there is gather/scatter support. This avoids
7280   // inevitable extracts into address registers, and also has the benefit of
7281   // activating LSR more, since that pass can't optimize vectorized
7282   // addresses.
7283   if (TTI.prefersVectorizedAddressing())
7284     return;
7285 
7286   // Start with all scalar pointer uses.
7287   SmallPtrSet<Instruction *, 8> AddrDefs;
7288   for (BasicBlock *BB : TheLoop->blocks())
7289     for (Instruction &I : *BB) {
7290       Instruction *PtrDef =
7291         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7292       if (PtrDef && TheLoop->contains(PtrDef) &&
7293           getWideningDecision(&I, VF) != CM_GatherScatter)
7294         AddrDefs.insert(PtrDef);
7295     }
7296 
7297   // Add all instructions used to generate the addresses.
7298   SmallVector<Instruction *, 4> Worklist;
7299   append_range(Worklist, AddrDefs);
7300   while (!Worklist.empty()) {
7301     Instruction *I = Worklist.pop_back_val();
7302     for (auto &Op : I->operands())
7303       if (auto *InstOp = dyn_cast<Instruction>(Op))
7304         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7305             AddrDefs.insert(InstOp).second)
7306           Worklist.push_back(InstOp);
7307   }
7308 
7309   for (auto *I : AddrDefs) {
7310     if (isa<LoadInst>(I)) {
7311       // Setting the desired widening decision should ideally be handled in
7312       // by cost functions, but since this involves the task of finding out
7313       // if the loaded register is involved in an address computation, it is
7314       // instead changed here when we know this is the case.
7315       InstWidening Decision = getWideningDecision(I, VF);
7316       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7317         // Scalarize a widened load of address.
7318         setWideningDecision(
7319             I, VF, CM_Scalarize,
7320             (VF.getKnownMinValue() *
7321              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7322       else if (auto Group = getInterleavedAccessGroup(I)) {
7323         // Scalarize an interleave group of address loads.
7324         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7325           if (Instruction *Member = Group->getMember(I))
7326             setWideningDecision(
7327                 Member, VF, CM_Scalarize,
7328                 (VF.getKnownMinValue() *
7329                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7330         }
7331       }
7332     } else
7333       // Make sure I gets scalarized and a cost estimate without
7334       // scalarization overhead.
7335       ForcedScalars[VF].insert(I);
7336   }
7337 }
7338 
7339 InstructionCost
7340 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7341                                                Type *&VectorTy) {
7342   Type *RetTy = I->getType();
7343   if (canTruncateToMinimalBitwidth(I, VF))
7344     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7345   auto SE = PSE.getSE();
7346   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7347 
7348   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7349                                                 ElementCount VF) -> bool {
7350     if (VF.isScalar())
7351       return true;
7352 
7353     auto Scalarized = InstsToScalarize.find(VF);
7354     assert(Scalarized != InstsToScalarize.end() &&
7355            "VF not yet analyzed for scalarization profitability");
7356     return !Scalarized->second.count(I) &&
7357            llvm::all_of(I->users(), [&](User *U) {
7358              auto *UI = cast<Instruction>(U);
7359              return !Scalarized->second.count(UI);
7360            });
7361   };
7362   (void) hasSingleCopyAfterVectorization;
7363 
7364   if (isScalarAfterVectorization(I, VF)) {
7365     // With the exception of GEPs and PHIs, after scalarization there should
7366     // only be one copy of the instruction generated in the loop. This is
7367     // because the VF is either 1, or any instructions that need scalarizing
7368     // have already been dealt with by the the time we get here. As a result,
7369     // it means we don't have to multiply the instruction cost by VF.
7370     assert(I->getOpcode() == Instruction::GetElementPtr ||
7371            I->getOpcode() == Instruction::PHI ||
7372            (I->getOpcode() == Instruction::BitCast &&
7373             I->getType()->isPointerTy()) ||
7374            hasSingleCopyAfterVectorization(I, VF));
7375     VectorTy = RetTy;
7376   } else
7377     VectorTy = ToVectorTy(RetTy, VF);
7378 
7379   // TODO: We need to estimate the cost of intrinsic calls.
7380   switch (I->getOpcode()) {
7381   case Instruction::GetElementPtr:
7382     // We mark this instruction as zero-cost because the cost of GEPs in
7383     // vectorized code depends on whether the corresponding memory instruction
7384     // is scalarized or not. Therefore, we handle GEPs with the memory
7385     // instruction cost.
7386     return 0;
7387   case Instruction::Br: {
7388     // In cases of scalarized and predicated instructions, there will be VF
7389     // predicated blocks in the vectorized loop. Each branch around these
7390     // blocks requires also an extract of its vector compare i1 element.
7391     bool ScalarPredicatedBB = false;
7392     BranchInst *BI = cast<BranchInst>(I);
7393     if (VF.isVector() && BI->isConditional() &&
7394         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7395          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7396       ScalarPredicatedBB = true;
7397 
7398     if (ScalarPredicatedBB) {
7399       // Not possible to scalarize scalable vector with predicated instructions.
7400       if (VF.isScalable())
7401         return InstructionCost::getInvalid();
7402       // Return cost for branches around scalarized and predicated blocks.
7403       auto *Vec_i1Ty =
7404           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7405       return (
7406           TTI.getScalarizationOverhead(
7407               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7408           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7409     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7410       // The back-edge branch will remain, as will all scalar branches.
7411       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7412     else
7413       // This branch will be eliminated by if-conversion.
7414       return 0;
7415     // Note: We currently assume zero cost for an unconditional branch inside
7416     // a predicated block since it will become a fall-through, although we
7417     // may decide in the future to call TTI for all branches.
7418   }
7419   case Instruction::PHI: {
7420     auto *Phi = cast<PHINode>(I);
7421 
7422     // First-order recurrences are replaced by vector shuffles inside the loop.
7423     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7424     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7425       return TTI.getShuffleCost(
7426           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7427           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7428 
7429     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7430     // converted into select instructions. We require N - 1 selects per phi
7431     // node, where N is the number of incoming values.
7432     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7433       return (Phi->getNumIncomingValues() - 1) *
7434              TTI.getCmpSelInstrCost(
7435                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7436                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7437                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7438 
7439     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7440   }
7441   case Instruction::UDiv:
7442   case Instruction::SDiv:
7443   case Instruction::URem:
7444   case Instruction::SRem:
7445     // If we have a predicated instruction, it may not be executed for each
7446     // vector lane. Get the scalarization cost and scale this amount by the
7447     // probability of executing the predicated block. If the instruction is not
7448     // predicated, we fall through to the next case.
7449     if (VF.isVector() && isScalarWithPredication(I)) {
7450       InstructionCost Cost = 0;
7451 
7452       // These instructions have a non-void type, so account for the phi nodes
7453       // that we will create. This cost is likely to be zero. The phi node
7454       // cost, if any, should be scaled by the block probability because it
7455       // models a copy at the end of each predicated block.
7456       Cost += VF.getKnownMinValue() *
7457               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7458 
7459       // The cost of the non-predicated instruction.
7460       Cost += VF.getKnownMinValue() *
7461               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7462 
7463       // The cost of insertelement and extractelement instructions needed for
7464       // scalarization.
7465       Cost += getScalarizationOverhead(I, VF);
7466 
7467       // Scale the cost by the probability of executing the predicated blocks.
7468       // This assumes the predicated block for each vector lane is equally
7469       // likely.
7470       return Cost / getReciprocalPredBlockProb();
7471     }
7472     LLVM_FALLTHROUGH;
7473   case Instruction::Add:
7474   case Instruction::FAdd:
7475   case Instruction::Sub:
7476   case Instruction::FSub:
7477   case Instruction::Mul:
7478   case Instruction::FMul:
7479   case Instruction::FDiv:
7480   case Instruction::FRem:
7481   case Instruction::Shl:
7482   case Instruction::LShr:
7483   case Instruction::AShr:
7484   case Instruction::And:
7485   case Instruction::Or:
7486   case Instruction::Xor: {
7487     // Since we will replace the stride by 1 the multiplication should go away.
7488     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7489       return 0;
7490 
7491     // Detect reduction patterns
7492     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7493       return *RedCost;
7494 
7495     // Certain instructions can be cheaper to vectorize if they have a constant
7496     // second vector operand. One example of this are shifts on x86.
7497     Value *Op2 = I->getOperand(1);
7498     TargetTransformInfo::OperandValueProperties Op2VP;
7499     TargetTransformInfo::OperandValueKind Op2VK =
7500         TTI.getOperandInfo(Op2, Op2VP);
7501     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7502       Op2VK = TargetTransformInfo::OK_UniformValue;
7503 
7504     SmallVector<const Value *, 4> Operands(I->operand_values());
7505     return TTI.getArithmeticInstrCost(
7506         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7507         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7508   }
7509   case Instruction::FNeg: {
7510     return TTI.getArithmeticInstrCost(
7511         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7512         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7513         TargetTransformInfo::OP_None, I->getOperand(0), I);
7514   }
7515   case Instruction::Select: {
7516     SelectInst *SI = cast<SelectInst>(I);
7517     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7518     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7519 
7520     const Value *Op0, *Op1;
7521     using namespace llvm::PatternMatch;
7522     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7523                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7524       // select x, y, false --> x & y
7525       // select x, true, y --> x | y
7526       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7527       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7528       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7529       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7530       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7531               Op1->getType()->getScalarSizeInBits() == 1);
7532 
7533       SmallVector<const Value *, 2> Operands{Op0, Op1};
7534       return TTI.getArithmeticInstrCost(
7535           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7536           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7537     }
7538 
7539     Type *CondTy = SI->getCondition()->getType();
7540     if (!ScalarCond)
7541       CondTy = VectorType::get(CondTy, VF);
7542 
7543     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7544     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7545       Pred = Cmp->getPredicate();
7546     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7547                                   CostKind, I);
7548   }
7549   case Instruction::ICmp:
7550   case Instruction::FCmp: {
7551     Type *ValTy = I->getOperand(0)->getType();
7552     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7553     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7554       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7555     VectorTy = ToVectorTy(ValTy, VF);
7556     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7557                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7558                                   I);
7559   }
7560   case Instruction::Store:
7561   case Instruction::Load: {
7562     ElementCount Width = VF;
7563     if (Width.isVector()) {
7564       InstWidening Decision = getWideningDecision(I, Width);
7565       assert(Decision != CM_Unknown &&
7566              "CM decision should be taken at this point");
7567       if (Decision == CM_Scalarize)
7568         Width = ElementCount::getFixed(1);
7569     }
7570     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7571     return getMemoryInstructionCost(I, VF);
7572   }
7573   case Instruction::BitCast:
7574     if (I->getType()->isPointerTy())
7575       return 0;
7576     LLVM_FALLTHROUGH;
7577   case Instruction::ZExt:
7578   case Instruction::SExt:
7579   case Instruction::FPToUI:
7580   case Instruction::FPToSI:
7581   case Instruction::FPExt:
7582   case Instruction::PtrToInt:
7583   case Instruction::IntToPtr:
7584   case Instruction::SIToFP:
7585   case Instruction::UIToFP:
7586   case Instruction::Trunc:
7587   case Instruction::FPTrunc: {
7588     // Computes the CastContextHint from a Load/Store instruction.
7589     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7590       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7591              "Expected a load or a store!");
7592 
7593       if (VF.isScalar() || !TheLoop->contains(I))
7594         return TTI::CastContextHint::Normal;
7595 
7596       switch (getWideningDecision(I, VF)) {
7597       case LoopVectorizationCostModel::CM_GatherScatter:
7598         return TTI::CastContextHint::GatherScatter;
7599       case LoopVectorizationCostModel::CM_Interleave:
7600         return TTI::CastContextHint::Interleave;
7601       case LoopVectorizationCostModel::CM_Scalarize:
7602       case LoopVectorizationCostModel::CM_Widen:
7603         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7604                                         : TTI::CastContextHint::Normal;
7605       case LoopVectorizationCostModel::CM_Widen_Reverse:
7606         return TTI::CastContextHint::Reversed;
7607       case LoopVectorizationCostModel::CM_Unknown:
7608         llvm_unreachable("Instr did not go through cost modelling?");
7609       }
7610 
7611       llvm_unreachable("Unhandled case!");
7612     };
7613 
7614     unsigned Opcode = I->getOpcode();
7615     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7616     // For Trunc, the context is the only user, which must be a StoreInst.
7617     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7618       if (I->hasOneUse())
7619         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7620           CCH = ComputeCCH(Store);
7621     }
7622     // For Z/Sext, the context is the operand, which must be a LoadInst.
7623     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7624              Opcode == Instruction::FPExt) {
7625       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7626         CCH = ComputeCCH(Load);
7627     }
7628 
7629     // We optimize the truncation of induction variables having constant
7630     // integer steps. The cost of these truncations is the same as the scalar
7631     // operation.
7632     if (isOptimizableIVTruncate(I, VF)) {
7633       auto *Trunc = cast<TruncInst>(I);
7634       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7635                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7636     }
7637 
7638     // Detect reduction patterns
7639     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7640       return *RedCost;
7641 
7642     Type *SrcScalarTy = I->getOperand(0)->getType();
7643     Type *SrcVecTy =
7644         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7645     if (canTruncateToMinimalBitwidth(I, VF)) {
7646       // This cast is going to be shrunk. This may remove the cast or it might
7647       // turn it into slightly different cast. For example, if MinBW == 16,
7648       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7649       //
7650       // Calculate the modified src and dest types.
7651       Type *MinVecTy = VectorTy;
7652       if (Opcode == Instruction::Trunc) {
7653         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7654         VectorTy =
7655             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7656       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7657         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7658         VectorTy =
7659             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7660       }
7661     }
7662 
7663     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7664   }
7665   case Instruction::Call: {
7666     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7667       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7668         return *RedCost;
7669     bool NeedToScalarize;
7670     CallInst *CI = cast<CallInst>(I);
7671     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7672     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7673       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7674       return std::min(CallCost, IntrinsicCost);
7675     }
7676     return CallCost;
7677   }
7678   case Instruction::ExtractValue:
7679     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7680   case Instruction::Alloca:
7681     // We cannot easily widen alloca to a scalable alloca, as
7682     // the result would need to be a vector of pointers.
7683     if (VF.isScalable())
7684       return InstructionCost::getInvalid();
7685     LLVM_FALLTHROUGH;
7686   default:
7687     // This opcode is unknown. Assume that it is the same as 'mul'.
7688     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7689   } // end of switch.
7690 }
7691 
7692 char LoopVectorize::ID = 0;
7693 
7694 static const char lv_name[] = "Loop Vectorization";
7695 
7696 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7697 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7698 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7699 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7700 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7701 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7702 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7703 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7704 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7705 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7706 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7707 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7708 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7709 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7710 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7711 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7712 
7713 namespace llvm {
7714 
7715 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7716 
7717 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7718                               bool VectorizeOnlyWhenForced) {
7719   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7720 }
7721 
7722 } // end namespace llvm
7723 
7724 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7725   // Check if the pointer operand of a load or store instruction is
7726   // consecutive.
7727   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7728     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7729   return false;
7730 }
7731 
7732 void LoopVectorizationCostModel::collectValuesToIgnore() {
7733   // Ignore ephemeral values.
7734   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7735 
7736   // Ignore type-promoting instructions we identified during reduction
7737   // detection.
7738   for (auto &Reduction : Legal->getReductionVars()) {
7739     const RecurrenceDescriptor &RedDes = Reduction.second;
7740     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7741     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7742   }
7743   // Ignore type-casting instructions we identified during induction
7744   // detection.
7745   for (auto &Induction : Legal->getInductionVars()) {
7746     const InductionDescriptor &IndDes = Induction.second;
7747     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7748     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7749   }
7750 }
7751 
7752 void LoopVectorizationCostModel::collectInLoopReductions() {
7753   for (auto &Reduction : Legal->getReductionVars()) {
7754     PHINode *Phi = Reduction.first;
7755     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7756 
7757     // We don't collect reductions that are type promoted (yet).
7758     if (RdxDesc.getRecurrenceType() != Phi->getType())
7759       continue;
7760 
7761     // If the target would prefer this reduction to happen "in-loop", then we
7762     // want to record it as such.
7763     unsigned Opcode = RdxDesc.getOpcode();
7764     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7765         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7766                                    TargetTransformInfo::ReductionFlags()))
7767       continue;
7768 
7769     // Check that we can correctly put the reductions into the loop, by
7770     // finding the chain of operations that leads from the phi to the loop
7771     // exit value.
7772     SmallVector<Instruction *, 4> ReductionOperations =
7773         RdxDesc.getReductionOpChain(Phi, TheLoop);
7774     bool InLoop = !ReductionOperations.empty();
7775     if (InLoop) {
7776       InLoopReductionChains[Phi] = ReductionOperations;
7777       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7778       Instruction *LastChain = Phi;
7779       for (auto *I : ReductionOperations) {
7780         InLoopReductionImmediateChains[I] = LastChain;
7781         LastChain = I;
7782       }
7783     }
7784     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7785                       << " reduction for phi: " << *Phi << "\n");
7786   }
7787 }
7788 
7789 // TODO: we could return a pair of values that specify the max VF and
7790 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7791 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7792 // doesn't have a cost model that can choose which plan to execute if
7793 // more than one is generated.
7794 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7795                                  LoopVectorizationCostModel &CM) {
7796   unsigned WidestType;
7797   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7798   return WidestVectorRegBits / WidestType;
7799 }
7800 
7801 VectorizationFactor
7802 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7803   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7804   ElementCount VF = UserVF;
7805   // Outer loop handling: They may require CFG and instruction level
7806   // transformations before even evaluating whether vectorization is profitable.
7807   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7808   // the vectorization pipeline.
7809   if (!OrigLoop->isInnermost()) {
7810     // If the user doesn't provide a vectorization factor, determine a
7811     // reasonable one.
7812     if (UserVF.isZero()) {
7813       VF = ElementCount::getFixed(determineVPlanVF(
7814           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7815               .getFixedSize(),
7816           CM));
7817       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7818 
7819       // Make sure we have a VF > 1 for stress testing.
7820       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7821         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7822                           << "overriding computed VF.\n");
7823         VF = ElementCount::getFixed(4);
7824       }
7825     }
7826     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7827     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7828            "VF needs to be a power of two");
7829     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7830                       << "VF " << VF << " to build VPlans.\n");
7831     buildVPlans(VF, VF);
7832 
7833     // For VPlan build stress testing, we bail out after VPlan construction.
7834     if (VPlanBuildStressTest)
7835       return VectorizationFactor::Disabled();
7836 
7837     return {VF, 0 /*Cost*/};
7838   }
7839 
7840   LLVM_DEBUG(
7841       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7842                 "VPlan-native path.\n");
7843   return VectorizationFactor::Disabled();
7844 }
7845 
7846 Optional<VectorizationFactor>
7847 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7848   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7849   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7850   if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
7851     return None;
7852 
7853   // Invalidate interleave groups if all blocks of loop will be predicated.
7854   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7855       !useMaskedInterleavedAccesses(*TTI)) {
7856     LLVM_DEBUG(
7857         dbgs()
7858         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7859            "which requires masked-interleaved support.\n");
7860     if (CM.InterleaveInfo.invalidateGroups())
7861       // Invalidating interleave groups also requires invalidating all decisions
7862       // based on them, which includes widening decisions and uniform and scalar
7863       // values.
7864       CM.invalidateCostModelingDecisions();
7865   }
7866 
7867   ElementCount MaxUserVF =
7868       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7869   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7870   if (!UserVF.isZero() && UserVFIsLegal) {
7871     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7872            "VF needs to be a power of two");
7873     // Collect the instructions (and their associated costs) that will be more
7874     // profitable to scalarize.
7875     if (CM.selectUserVectorizationFactor(UserVF)) {
7876       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7877       CM.collectInLoopReductions();
7878       buildVPlansWithVPRecipes(UserVF, UserVF);
7879       LLVM_DEBUG(printPlans(dbgs()));
7880       return {{UserVF, 0}};
7881     } else
7882       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7883                               "InvalidCost", ORE, OrigLoop);
7884   }
7885 
7886   // Populate the set of Vectorization Factor Candidates.
7887   ElementCountSet VFCandidates;
7888   for (auto VF = ElementCount::getFixed(1);
7889        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7890     VFCandidates.insert(VF);
7891   for (auto VF = ElementCount::getScalable(1);
7892        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7893     VFCandidates.insert(VF);
7894 
7895   for (const auto &VF : VFCandidates) {
7896     // Collect Uniform and Scalar instructions after vectorization with VF.
7897     CM.collectUniformsAndScalars(VF);
7898 
7899     // Collect the instructions (and their associated costs) that will be more
7900     // profitable to scalarize.
7901     if (VF.isVector())
7902       CM.collectInstsToScalarize(VF);
7903   }
7904 
7905   CM.collectInLoopReductions();
7906   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7907   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7908 
7909   LLVM_DEBUG(printPlans(dbgs()));
7910   if (!MaxFactors.hasVector())
7911     return VectorizationFactor::Disabled();
7912 
7913   // Select the optimal vectorization factor.
7914   auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
7915 
7916   // Check if it is profitable to vectorize with runtime checks.
7917   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
7918   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
7919     bool PragmaThresholdReached =
7920         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
7921     bool ThresholdReached =
7922         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
7923     if ((ThresholdReached && !Hints.allowReordering()) ||
7924         PragmaThresholdReached) {
7925       ORE->emit([&]() {
7926         return OptimizationRemarkAnalysisAliasing(
7927                    DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
7928                    OrigLoop->getHeader())
7929                << "loop not vectorized: cannot prove it is safe to reorder "
7930                   "memory operations";
7931       });
7932       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
7933       Hints.emitRemarkWithHints();
7934       return VectorizationFactor::Disabled();
7935     }
7936   }
7937   return SelectedVF;
7938 }
7939 
7940 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7941   assert(count_if(VPlans,
7942                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7943              1 &&
7944          "Best VF has not a single VPlan.");
7945 
7946   for (const VPlanPtr &Plan : VPlans) {
7947     if (Plan->hasVF(VF))
7948       return *Plan.get();
7949   }
7950   llvm_unreachable("No plan found!");
7951 }
7952 
7953 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7954                                            VPlan &BestVPlan,
7955                                            InnerLoopVectorizer &ILV,
7956                                            DominatorTree *DT) {
7957   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7958                     << '\n');
7959 
7960   // Perform the actual loop transformation.
7961 
7962   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7963   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7964   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7965   State.CanonicalIV = ILV.Induction;
7966   ILV.collectPoisonGeneratingRecipes(State);
7967 
7968   ILV.printDebugTracesAtStart();
7969 
7970   //===------------------------------------------------===//
7971   //
7972   // Notice: any optimization or new instruction that go
7973   // into the code below should also be implemented in
7974   // the cost-model.
7975   //
7976   //===------------------------------------------------===//
7977 
7978   // 2. Copy and widen instructions from the old loop into the new loop.
7979   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), State);
7980   BestVPlan.execute(&State);
7981 
7982   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7983   //    predication, updating analyses.
7984   ILV.fixVectorizedLoop(State);
7985 
7986   ILV.printDebugTracesAtEnd();
7987 }
7988 
7989 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7990 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7991   for (const auto &Plan : VPlans)
7992     if (PrintVPlansInDotFormat)
7993       Plan->printDOT(O);
7994     else
7995       Plan->print(O);
7996 }
7997 #endif
7998 
7999 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
8000     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
8001 
8002   // We create new control-flow for the vectorized loop, so the original exit
8003   // conditions will be dead after vectorization if it's only used by the
8004   // terminator
8005   SmallVector<BasicBlock*> ExitingBlocks;
8006   OrigLoop->getExitingBlocks(ExitingBlocks);
8007   for (auto *BB : ExitingBlocks) {
8008     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
8009     if (!Cmp || !Cmp->hasOneUse())
8010       continue;
8011 
8012     // TODO: we should introduce a getUniqueExitingBlocks on Loop
8013     if (!DeadInstructions.insert(Cmp).second)
8014       continue;
8015 
8016     // The operands of the icmp is often a dead trunc, used by IndUpdate.
8017     // TODO: can recurse through operands in general
8018     for (Value *Op : Cmp->operands()) {
8019       if (isa<TruncInst>(Op) && Op->hasOneUse())
8020           DeadInstructions.insert(cast<Instruction>(Op));
8021     }
8022   }
8023 
8024   // We create new "steps" for induction variable updates to which the original
8025   // induction variables map. An original update instruction will be dead if
8026   // all its users except the induction variable are dead.
8027   auto *Latch = OrigLoop->getLoopLatch();
8028   for (auto &Induction : Legal->getInductionVars()) {
8029     PHINode *Ind = Induction.first;
8030     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8031 
8032     // If the tail is to be folded by masking, the primary induction variable,
8033     // if exists, isn't dead: it will be used for masking. Don't kill it.
8034     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
8035       continue;
8036 
8037     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
8038           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
8039         }))
8040       DeadInstructions.insert(IndUpdate);
8041   }
8042 }
8043 
8044 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
8045 
8046 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
8047 
8048 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
8049   SmallVector<Metadata *, 4> MDs;
8050   // Reserve first location for self reference to the LoopID metadata node.
8051   MDs.push_back(nullptr);
8052   bool IsUnrollMetadata = false;
8053   MDNode *LoopID = L->getLoopID();
8054   if (LoopID) {
8055     // First find existing loop unrolling disable metadata.
8056     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
8057       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
8058       if (MD) {
8059         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
8060         IsUnrollMetadata =
8061             S && S->getString().startswith("llvm.loop.unroll.disable");
8062       }
8063       MDs.push_back(LoopID->getOperand(i));
8064     }
8065   }
8066 
8067   if (!IsUnrollMetadata) {
8068     // Add runtime unroll disable metadata.
8069     LLVMContext &Context = L->getHeader()->getContext();
8070     SmallVector<Metadata *, 1> DisableOperands;
8071     DisableOperands.push_back(
8072         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
8073     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
8074     MDs.push_back(DisableNode);
8075     MDNode *NewLoopID = MDNode::get(Context, MDs);
8076     // Set operand 0 to refer to the loop id itself.
8077     NewLoopID->replaceOperandWith(0, NewLoopID);
8078     L->setLoopID(NewLoopID);
8079   }
8080 }
8081 
8082 //===--------------------------------------------------------------------===//
8083 // EpilogueVectorizerMainLoop
8084 //===--------------------------------------------------------------------===//
8085 
8086 /// This function is partially responsible for generating the control flow
8087 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8088 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
8089   MDNode *OrigLoopID = OrigLoop->getLoopID();
8090   Loop *Lp = createVectorLoopSkeleton("");
8091 
8092   // Generate the code to check the minimum iteration count of the vector
8093   // epilogue (see below).
8094   EPI.EpilogueIterationCountCheck =
8095       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
8096   EPI.EpilogueIterationCountCheck->setName("iter.check");
8097 
8098   // Generate the code to check any assumptions that we've made for SCEV
8099   // expressions.
8100   EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
8101 
8102   // Generate the code that checks at runtime if arrays overlap. We put the
8103   // checks into a separate block to make the more common case of few elements
8104   // faster.
8105   EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
8106 
8107   // Generate the iteration count check for the main loop, *after* the check
8108   // for the epilogue loop, so that the path-length is shorter for the case
8109   // that goes directly through the vector epilogue. The longer-path length for
8110   // the main loop is compensated for, by the gain from vectorizing the larger
8111   // trip count. Note: the branch will get updated later on when we vectorize
8112   // the epilogue.
8113   EPI.MainLoopIterationCountCheck =
8114       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
8115 
8116   // Generate the induction variable.
8117   OldInduction = Legal->getPrimaryInduction();
8118   Type *IdxTy = Legal->getWidestInductionType();
8119   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8120 
8121   IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt());
8122   Value *Step = getRuntimeVF(B, IdxTy, VF * UF);
8123   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8124   EPI.VectorTripCount = CountRoundDown;
8125   Induction =
8126       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8127                               getDebugLocFromInstOrOperands(OldInduction));
8128 
8129   // Skip induction resume value creation here because they will be created in
8130   // the second pass. If we created them here, they wouldn't be used anyway,
8131   // because the vplan in the second pass still contains the inductions from the
8132   // original loop.
8133 
8134   return completeLoopSkeleton(Lp, OrigLoopID);
8135 }
8136 
8137 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
8138   LLVM_DEBUG({
8139     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
8140            << "Main Loop VF:" << EPI.MainLoopVF
8141            << ", Main Loop UF:" << EPI.MainLoopUF
8142            << ", Epilogue Loop VF:" << EPI.EpilogueVF
8143            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8144   });
8145 }
8146 
8147 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
8148   DEBUG_WITH_TYPE(VerboseDebug, {
8149     dbgs() << "intermediate fn:\n"
8150            << *OrigLoop->getHeader()->getParent() << "\n";
8151   });
8152 }
8153 
8154 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8155     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
8156   assert(L && "Expected valid Loop.");
8157   assert(Bypass && "Expected valid bypass basic block.");
8158   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
8159   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8160   Value *Count = getOrCreateTripCount(L);
8161   // Reuse existing vector loop preheader for TC checks.
8162   // Note that new preheader block is generated for vector loop.
8163   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8164   IRBuilder<> Builder(TCCheckBlock->getTerminator());
8165 
8166   // Generate code to check if the loop's trip count is less than VF * UF of the
8167   // main vector loop.
8168   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
8169       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8170 
8171   Value *CheckMinIters = Builder.CreateICmp(
8172       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
8173       "min.iters.check");
8174 
8175   if (!ForEpilogue)
8176     TCCheckBlock->setName("vector.main.loop.iter.check");
8177 
8178   // Create new preheader for vector loop.
8179   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8180                                    DT, LI, nullptr, "vector.ph");
8181 
8182   if (ForEpilogue) {
8183     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8184                                  DT->getNode(Bypass)->getIDom()) &&
8185            "TC check is expected to dominate Bypass");
8186 
8187     // Update dominator for Bypass & LoopExit.
8188     DT->changeImmediateDominator(Bypass, TCCheckBlock);
8189     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8190       // For loops with multiple exits, there's no edge from the middle block
8191       // to exit blocks (as the epilogue must run) and thus no need to update
8192       // the immediate dominator of the exit blocks.
8193       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8194 
8195     LoopBypassBlocks.push_back(TCCheckBlock);
8196 
8197     // Save the trip count so we don't have to regenerate it in the
8198     // vec.epilog.iter.check. This is safe to do because the trip count
8199     // generated here dominates the vector epilog iter check.
8200     EPI.TripCount = Count;
8201   }
8202 
8203   ReplaceInstWithInst(
8204       TCCheckBlock->getTerminator(),
8205       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8206 
8207   return TCCheckBlock;
8208 }
8209 
8210 //===--------------------------------------------------------------------===//
8211 // EpilogueVectorizerEpilogueLoop
8212 //===--------------------------------------------------------------------===//
8213 
8214 /// This function is partially responsible for generating the control flow
8215 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8216 BasicBlock *
8217 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8218   MDNode *OrigLoopID = OrigLoop->getLoopID();
8219   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
8220 
8221   // Now, compare the remaining count and if there aren't enough iterations to
8222   // execute the vectorized epilogue skip to the scalar part.
8223   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
8224   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
8225   LoopVectorPreHeader =
8226       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
8227                  LI, nullptr, "vec.epilog.ph");
8228   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
8229                                           VecEpilogueIterationCountCheck);
8230 
8231   // Adjust the control flow taking the state info from the main loop
8232   // vectorization into account.
8233   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8234          "expected this to be saved from the previous pass.");
8235   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8236       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8237 
8238   DT->changeImmediateDominator(LoopVectorPreHeader,
8239                                EPI.MainLoopIterationCountCheck);
8240 
8241   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8242       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8243 
8244   if (EPI.SCEVSafetyCheck)
8245     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8246         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8247   if (EPI.MemSafetyCheck)
8248     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8249         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8250 
8251   DT->changeImmediateDominator(
8252       VecEpilogueIterationCountCheck,
8253       VecEpilogueIterationCountCheck->getSinglePredecessor());
8254 
8255   DT->changeImmediateDominator(LoopScalarPreHeader,
8256                                EPI.EpilogueIterationCountCheck);
8257   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8258     // If there is an epilogue which must run, there's no edge from the
8259     // middle block to exit blocks  and thus no need to update the immediate
8260     // dominator of the exit blocks.
8261     DT->changeImmediateDominator(LoopExitBlock,
8262                                  EPI.EpilogueIterationCountCheck);
8263 
8264   // Keep track of bypass blocks, as they feed start values to the induction
8265   // phis in the scalar loop preheader.
8266   if (EPI.SCEVSafetyCheck)
8267     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8268   if (EPI.MemSafetyCheck)
8269     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8270   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8271 
8272   // Generate a resume induction for the vector epilogue and put it in the
8273   // vector epilogue preheader
8274   Type *IdxTy = Legal->getWidestInductionType();
8275   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8276                                          LoopVectorPreHeader->getFirstNonPHI());
8277   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8278   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8279                            EPI.MainLoopIterationCountCheck);
8280 
8281   // Generate the induction variable.
8282   OldInduction = Legal->getPrimaryInduction();
8283   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8284   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8285   Value *StartIdx = EPResumeVal;
8286   Induction =
8287       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8288                               getDebugLocFromInstOrOperands(OldInduction));
8289 
8290   // Generate induction resume values. These variables save the new starting
8291   // indexes for the scalar loop. They are used to test if there are any tail
8292   // iterations left once the vector loop has completed.
8293   // Note that when the vectorized epilogue is skipped due to iteration count
8294   // check, then the resume value for the induction variable comes from
8295   // the trip count of the main vector loop, hence passing the AdditionalBypass
8296   // argument.
8297   createInductionResumeValues(Lp, CountRoundDown,
8298                               {VecEpilogueIterationCountCheck,
8299                                EPI.VectorTripCount} /* AdditionalBypass */);
8300 
8301   AddRuntimeUnrollDisableMetaData(Lp);
8302   return completeLoopSkeleton(Lp, OrigLoopID);
8303 }
8304 
8305 BasicBlock *
8306 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8307     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8308 
8309   assert(EPI.TripCount &&
8310          "Expected trip count to have been safed in the first pass.");
8311   assert(
8312       (!isa<Instruction>(EPI.TripCount) ||
8313        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8314       "saved trip count does not dominate insertion point.");
8315   Value *TC = EPI.TripCount;
8316   IRBuilder<> Builder(Insert->getTerminator());
8317   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8318 
8319   // Generate code to check if the loop's trip count is less than VF * UF of the
8320   // vector epilogue loop.
8321   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8322       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8323 
8324   Value *CheckMinIters =
8325       Builder.CreateICmp(P, Count,
8326                          createStepForVF(Builder, Count->getType(),
8327                                          EPI.EpilogueVF, EPI.EpilogueUF),
8328                          "min.epilog.iters.check");
8329 
8330   ReplaceInstWithInst(
8331       Insert->getTerminator(),
8332       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8333 
8334   LoopBypassBlocks.push_back(Insert);
8335   return Insert;
8336 }
8337 
8338 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8339   LLVM_DEBUG({
8340     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8341            << "Epilogue Loop VF:" << EPI.EpilogueVF
8342            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8343   });
8344 }
8345 
8346 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8347   DEBUG_WITH_TYPE(VerboseDebug, {
8348     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8349   });
8350 }
8351 
8352 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8353     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8354   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8355   bool PredicateAtRangeStart = Predicate(Range.Start);
8356 
8357   for (ElementCount TmpVF = Range.Start * 2;
8358        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8359     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8360       Range.End = TmpVF;
8361       break;
8362     }
8363 
8364   return PredicateAtRangeStart;
8365 }
8366 
8367 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8368 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8369 /// of VF's starting at a given VF and extending it as much as possible. Each
8370 /// vectorization decision can potentially shorten this sub-range during
8371 /// buildVPlan().
8372 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8373                                            ElementCount MaxVF) {
8374   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8375   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8376     VFRange SubRange = {VF, MaxVFPlusOne};
8377     VPlans.push_back(buildVPlan(SubRange));
8378     VF = SubRange.End;
8379   }
8380 }
8381 
8382 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8383                                          VPlanPtr &Plan) {
8384   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8385 
8386   // Look for cached value.
8387   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8388   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8389   if (ECEntryIt != EdgeMaskCache.end())
8390     return ECEntryIt->second;
8391 
8392   VPValue *SrcMask = createBlockInMask(Src, Plan);
8393 
8394   // The terminator has to be a branch inst!
8395   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8396   assert(BI && "Unexpected terminator found");
8397 
8398   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8399     return EdgeMaskCache[Edge] = SrcMask;
8400 
8401   // If source is an exiting block, we know the exit edge is dynamically dead
8402   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8403   // adding uses of an otherwise potentially dead instruction.
8404   if (OrigLoop->isLoopExiting(Src))
8405     return EdgeMaskCache[Edge] = SrcMask;
8406 
8407   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8408   assert(EdgeMask && "No Edge Mask found for condition");
8409 
8410   if (BI->getSuccessor(0) != Dst)
8411     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8412 
8413   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8414     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8415     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8416     // The select version does not introduce new UB if SrcMask is false and
8417     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8418     VPValue *False = Plan->getOrAddVPValue(
8419         ConstantInt::getFalse(BI->getCondition()->getType()));
8420     EdgeMask =
8421         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8422   }
8423 
8424   return EdgeMaskCache[Edge] = EdgeMask;
8425 }
8426 
8427 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8428   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8429 
8430   // Look for cached value.
8431   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8432   if (BCEntryIt != BlockMaskCache.end())
8433     return BCEntryIt->second;
8434 
8435   // All-one mask is modelled as no-mask following the convention for masked
8436   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8437   VPValue *BlockMask = nullptr;
8438 
8439   if (OrigLoop->getHeader() == BB) {
8440     if (!CM.blockNeedsPredicationForAnyReason(BB))
8441       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8442 
8443     // Introduce the early-exit compare IV <= BTC to form header block mask.
8444     // This is used instead of IV < TC because TC may wrap, unlike BTC.
8445     // Start by constructing the desired canonical IV in the header block.
8446     VPValue *IV = nullptr;
8447     if (Legal->getPrimaryInduction())
8448       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
8449     else {
8450       VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
8451       auto *IVRecipe = new VPWidenCanonicalIVRecipe();
8452       HeaderVPBB->insert(IVRecipe, HeaderVPBB->getFirstNonPhi());
8453       IV = IVRecipe;
8454     }
8455 
8456     // Create the block in mask as the first non-phi instruction in the block.
8457     VPBuilder::InsertPointGuard Guard(Builder);
8458     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
8459     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
8460 
8461     bool TailFolded = !CM.isScalarEpilogueAllowed();
8462 
8463     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
8464       VPValue *TC = Plan->getOrCreateTripCount();
8465       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
8466     } else {
8467       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8468       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8469     }
8470     return BlockMaskCache[BB] = BlockMask;
8471   }
8472 
8473   // This is the block mask. We OR all incoming edges.
8474   for (auto *Predecessor : predecessors(BB)) {
8475     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8476     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8477       return BlockMaskCache[BB] = EdgeMask;
8478 
8479     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8480       BlockMask = EdgeMask;
8481       continue;
8482     }
8483 
8484     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8485   }
8486 
8487   return BlockMaskCache[BB] = BlockMask;
8488 }
8489 
8490 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8491                                                 ArrayRef<VPValue *> Operands,
8492                                                 VFRange &Range,
8493                                                 VPlanPtr &Plan) {
8494   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8495          "Must be called with either a load or store");
8496 
8497   auto willWiden = [&](ElementCount VF) -> bool {
8498     if (VF.isScalar())
8499       return false;
8500     LoopVectorizationCostModel::InstWidening Decision =
8501         CM.getWideningDecision(I, VF);
8502     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8503            "CM decision should be taken at this point.");
8504     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8505       return true;
8506     if (CM.isScalarAfterVectorization(I, VF) ||
8507         CM.isProfitableToScalarize(I, VF))
8508       return false;
8509     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8510   };
8511 
8512   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8513     return nullptr;
8514 
8515   VPValue *Mask = nullptr;
8516   if (Legal->isMaskRequired(I))
8517     Mask = createBlockInMask(I->getParent(), Plan);
8518 
8519   // Determine if the pointer operand of the access is either consecutive or
8520   // reverse consecutive.
8521   LoopVectorizationCostModel::InstWidening Decision =
8522       CM.getWideningDecision(I, Range.Start);
8523   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8524   bool Consecutive =
8525       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8526 
8527   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8528     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8529                                               Consecutive, Reverse);
8530 
8531   StoreInst *Store = cast<StoreInst>(I);
8532   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8533                                             Mask, Consecutive, Reverse);
8534 }
8535 
8536 VPWidenIntOrFpInductionRecipe *
8537 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
8538                                            ArrayRef<VPValue *> Operands) const {
8539   // Check if this is an integer or fp induction. If so, build the recipe that
8540   // produces its scalar and vector values.
8541   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) {
8542     assert(II->getStartValue() ==
8543            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8544     return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II);
8545   }
8546 
8547   return nullptr;
8548 }
8549 
8550 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8551     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8552     VPlan &Plan) const {
8553   // Optimize the special case where the source is a constant integer
8554   // induction variable. Notice that we can only optimize the 'trunc' case
8555   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8556   // (c) other casts depend on pointer size.
8557 
8558   // Determine whether \p K is a truncation based on an induction variable that
8559   // can be optimized.
8560   auto isOptimizableIVTruncate =
8561       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8562     return [=](ElementCount VF) -> bool {
8563       return CM.isOptimizableIVTruncate(K, VF);
8564     };
8565   };
8566 
8567   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8568           isOptimizableIVTruncate(I), Range)) {
8569 
8570     auto *Phi = cast<PHINode>(I->getOperand(0));
8571     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8572     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8573     return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I);
8574   }
8575   return nullptr;
8576 }
8577 
8578 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8579                                                 ArrayRef<VPValue *> Operands,
8580                                                 VPlanPtr &Plan) {
8581   // If all incoming values are equal, the incoming VPValue can be used directly
8582   // instead of creating a new VPBlendRecipe.
8583   VPValue *FirstIncoming = Operands[0];
8584   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8585         return FirstIncoming == Inc;
8586       })) {
8587     return Operands[0];
8588   }
8589 
8590   // We know that all PHIs in non-header blocks are converted into selects, so
8591   // we don't have to worry about the insertion order and we can just use the
8592   // builder. At this point we generate the predication tree. There may be
8593   // duplications since this is a simple recursive scan, but future
8594   // optimizations will clean it up.
8595   SmallVector<VPValue *, 2> OperandsWithMask;
8596   unsigned NumIncoming = Phi->getNumIncomingValues();
8597 
8598   for (unsigned In = 0; In < NumIncoming; In++) {
8599     VPValue *EdgeMask =
8600       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8601     assert((EdgeMask || NumIncoming == 1) &&
8602            "Multiple predecessors with one having a full mask");
8603     OperandsWithMask.push_back(Operands[In]);
8604     if (EdgeMask)
8605       OperandsWithMask.push_back(EdgeMask);
8606   }
8607   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8608 }
8609 
8610 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8611                                                    ArrayRef<VPValue *> Operands,
8612                                                    VFRange &Range) const {
8613 
8614   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8615       [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
8616       Range);
8617 
8618   if (IsPredicated)
8619     return nullptr;
8620 
8621   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8622   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8623              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8624              ID == Intrinsic::pseudoprobe ||
8625              ID == Intrinsic::experimental_noalias_scope_decl))
8626     return nullptr;
8627 
8628   auto willWiden = [&](ElementCount VF) -> bool {
8629     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8630     // The following case may be scalarized depending on the VF.
8631     // The flag shows whether we use Intrinsic or a usual Call for vectorized
8632     // version of the instruction.
8633     // Is it beneficial to perform intrinsic call compared to lib call?
8634     bool NeedToScalarize = false;
8635     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8636     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8637     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8638     return UseVectorIntrinsic || !NeedToScalarize;
8639   };
8640 
8641   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8642     return nullptr;
8643 
8644   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8645   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8646 }
8647 
8648 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8649   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8650          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8651   // Instruction should be widened, unless it is scalar after vectorization,
8652   // scalarization is profitable or it is predicated.
8653   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8654     return CM.isScalarAfterVectorization(I, VF) ||
8655            CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
8656   };
8657   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8658                                                              Range);
8659 }
8660 
8661 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8662                                            ArrayRef<VPValue *> Operands) const {
8663   auto IsVectorizableOpcode = [](unsigned Opcode) {
8664     switch (Opcode) {
8665     case Instruction::Add:
8666     case Instruction::And:
8667     case Instruction::AShr:
8668     case Instruction::BitCast:
8669     case Instruction::FAdd:
8670     case Instruction::FCmp:
8671     case Instruction::FDiv:
8672     case Instruction::FMul:
8673     case Instruction::FNeg:
8674     case Instruction::FPExt:
8675     case Instruction::FPToSI:
8676     case Instruction::FPToUI:
8677     case Instruction::FPTrunc:
8678     case Instruction::FRem:
8679     case Instruction::FSub:
8680     case Instruction::ICmp:
8681     case Instruction::IntToPtr:
8682     case Instruction::LShr:
8683     case Instruction::Mul:
8684     case Instruction::Or:
8685     case Instruction::PtrToInt:
8686     case Instruction::SDiv:
8687     case Instruction::Select:
8688     case Instruction::SExt:
8689     case Instruction::Shl:
8690     case Instruction::SIToFP:
8691     case Instruction::SRem:
8692     case Instruction::Sub:
8693     case Instruction::Trunc:
8694     case Instruction::UDiv:
8695     case Instruction::UIToFP:
8696     case Instruction::URem:
8697     case Instruction::Xor:
8698     case Instruction::ZExt:
8699       return true;
8700     }
8701     return false;
8702   };
8703 
8704   if (!IsVectorizableOpcode(I->getOpcode()))
8705     return nullptr;
8706 
8707   // Success: widen this instruction.
8708   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8709 }
8710 
8711 void VPRecipeBuilder::fixHeaderPhis() {
8712   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8713   for (VPHeaderPHIRecipe *R : PhisToFix) {
8714     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8715     VPRecipeBase *IncR =
8716         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8717     R->addOperand(IncR->getVPSingleValue());
8718   }
8719 }
8720 
8721 VPBasicBlock *VPRecipeBuilder::handleReplication(
8722     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8723     VPlanPtr &Plan) {
8724   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8725       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8726       Range);
8727 
8728   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8729       [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
8730       Range);
8731 
8732   // Even if the instruction is not marked as uniform, there are certain
8733   // intrinsic calls that can be effectively treated as such, so we check for
8734   // them here. Conservatively, we only do this for scalable vectors, since
8735   // for fixed-width VFs we can always fall back on full scalarization.
8736   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8737     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8738     case Intrinsic::assume:
8739     case Intrinsic::lifetime_start:
8740     case Intrinsic::lifetime_end:
8741       // For scalable vectors if one of the operands is variant then we still
8742       // want to mark as uniform, which will generate one instruction for just
8743       // the first lane of the vector. We can't scalarize the call in the same
8744       // way as for fixed-width vectors because we don't know how many lanes
8745       // there are.
8746       //
8747       // The reasons for doing it this way for scalable vectors are:
8748       //   1. For the assume intrinsic generating the instruction for the first
8749       //      lane is still be better than not generating any at all. For
8750       //      example, the input may be a splat across all lanes.
8751       //   2. For the lifetime start/end intrinsics the pointer operand only
8752       //      does anything useful when the input comes from a stack object,
8753       //      which suggests it should always be uniform. For non-stack objects
8754       //      the effect is to poison the object, which still allows us to
8755       //      remove the call.
8756       IsUniform = true;
8757       break;
8758     default:
8759       break;
8760     }
8761   }
8762 
8763   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8764                                        IsUniform, IsPredicated);
8765   setRecipe(I, Recipe);
8766   Plan->addVPValue(I, Recipe);
8767 
8768   // Find if I uses a predicated instruction. If so, it will use its scalar
8769   // value. Avoid hoisting the insert-element which packs the scalar value into
8770   // a vector value, as that happens iff all users use the vector value.
8771   for (VPValue *Op : Recipe->operands()) {
8772     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8773     if (!PredR)
8774       continue;
8775     auto *RepR =
8776         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8777     assert(RepR->isPredicated() &&
8778            "expected Replicate recipe to be predicated");
8779     RepR->setAlsoPack(false);
8780   }
8781 
8782   // Finalize the recipe for Instr, first if it is not predicated.
8783   if (!IsPredicated) {
8784     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8785     VPBB->appendRecipe(Recipe);
8786     return VPBB;
8787   }
8788   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8789 
8790   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8791   assert(SingleSucc && "VPBB must have a single successor when handling "
8792                        "predicated replication.");
8793   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8794   // Record predicated instructions for above packing optimizations.
8795   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8796   VPBlockUtils::insertBlockAfter(Region, VPBB);
8797   auto *RegSucc = new VPBasicBlock();
8798   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8799   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8800   return RegSucc;
8801 }
8802 
8803 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8804                                                       VPRecipeBase *PredRecipe,
8805                                                       VPlanPtr &Plan) {
8806   // Instructions marked for predication are replicated and placed under an
8807   // if-then construct to prevent side-effects.
8808 
8809   // Generate recipes to compute the block mask for this region.
8810   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8811 
8812   // Build the triangular if-then region.
8813   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8814   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8815   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8816   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8817   auto *PHIRecipe = Instr->getType()->isVoidTy()
8818                         ? nullptr
8819                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8820   if (PHIRecipe) {
8821     Plan->removeVPValueFor(Instr);
8822     Plan->addVPValue(Instr, PHIRecipe);
8823   }
8824   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8825   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8826   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8827 
8828   // Note: first set Entry as region entry and then connect successors starting
8829   // from it in order, to propagate the "parent" of each VPBasicBlock.
8830   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8831   VPBlockUtils::connectBlocks(Pred, Exit);
8832 
8833   return Region;
8834 }
8835 
8836 VPRecipeOrVPValueTy
8837 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8838                                         ArrayRef<VPValue *> Operands,
8839                                         VFRange &Range, VPlanPtr &Plan) {
8840   // First, check for specific widening recipes that deal with calls, memory
8841   // operations, inductions and Phi nodes.
8842   if (auto *CI = dyn_cast<CallInst>(Instr))
8843     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8844 
8845   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8846     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8847 
8848   VPRecipeBase *Recipe;
8849   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8850     if (Phi->getParent() != OrigLoop->getHeader())
8851       return tryToBlend(Phi, Operands, Plan);
8852     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
8853       return toVPRecipeResult(Recipe);
8854 
8855     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8856     if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
8857       VPValue *StartV = Operands[0];
8858       if (Legal->isReductionVariable(Phi)) {
8859         const RecurrenceDescriptor &RdxDesc =
8860             Legal->getReductionVars().find(Phi)->second;
8861         assert(RdxDesc.getRecurrenceStartValue() ==
8862                Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8863         PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8864                                              CM.isInLoopReduction(Phi),
8865                                              CM.useOrderedReductions(RdxDesc));
8866       } else {
8867         PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8868       }
8869 
8870       // Record the incoming value from the backedge, so we can add the incoming
8871       // value from the backedge after all recipes have been created.
8872       recordRecipeOf(cast<Instruction>(
8873           Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8874       PhisToFix.push_back(PhiRecipe);
8875     } else {
8876       // TODO: record start and backedge value for remaining pointer induction
8877       // phis.
8878       assert(Phi->getType()->isPointerTy() &&
8879              "only pointer phis should be handled here");
8880       PhiRecipe = new VPWidenPHIRecipe(Phi);
8881     }
8882 
8883     return toVPRecipeResult(PhiRecipe);
8884   }
8885 
8886   if (isa<TruncInst>(Instr) &&
8887       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8888                                                Range, *Plan)))
8889     return toVPRecipeResult(Recipe);
8890 
8891   if (!shouldWiden(Instr, Range))
8892     return nullptr;
8893 
8894   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8895     return toVPRecipeResult(new VPWidenGEPRecipe(
8896         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8897 
8898   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8899     bool InvariantCond =
8900         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8901     return toVPRecipeResult(new VPWidenSelectRecipe(
8902         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8903   }
8904 
8905   return toVPRecipeResult(tryToWiden(Instr, Operands));
8906 }
8907 
8908 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8909                                                         ElementCount MaxVF) {
8910   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8911 
8912   // Collect instructions from the original loop that will become trivially dead
8913   // in the vectorized loop. We don't need to vectorize these instructions. For
8914   // example, original induction update instructions can become dead because we
8915   // separately emit induction "steps" when generating code for the new loop.
8916   // Similarly, we create a new latch condition when setting up the structure
8917   // of the new loop, so the old one can become dead.
8918   SmallPtrSet<Instruction *, 4> DeadInstructions;
8919   collectTriviallyDeadInstructions(DeadInstructions);
8920 
8921   // Add assume instructions we need to drop to DeadInstructions, to prevent
8922   // them from being added to the VPlan.
8923   // TODO: We only need to drop assumes in blocks that get flattend. If the
8924   // control flow is preserved, we should keep them.
8925   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8926   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8927 
8928   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8929   // Dead instructions do not need sinking. Remove them from SinkAfter.
8930   for (Instruction *I : DeadInstructions)
8931     SinkAfter.erase(I);
8932 
8933   // Cannot sink instructions after dead instructions (there won't be any
8934   // recipes for them). Instead, find the first non-dead previous instruction.
8935   for (auto &P : Legal->getSinkAfter()) {
8936     Instruction *SinkTarget = P.second;
8937     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8938     (void)FirstInst;
8939     while (DeadInstructions.contains(SinkTarget)) {
8940       assert(
8941           SinkTarget != FirstInst &&
8942           "Must find a live instruction (at least the one feeding the "
8943           "first-order recurrence PHI) before reaching beginning of the block");
8944       SinkTarget = SinkTarget->getPrevNode();
8945       assert(SinkTarget != P.first &&
8946              "sink source equals target, no sinking required");
8947     }
8948     P.second = SinkTarget;
8949   }
8950 
8951   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8952   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8953     VFRange SubRange = {VF, MaxVFPlusOne};
8954     VPlans.push_back(
8955         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8956     VF = SubRange.End;
8957   }
8958 }
8959 
8960 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8961     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8962     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8963 
8964   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8965 
8966   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8967 
8968   // ---------------------------------------------------------------------------
8969   // Pre-construction: record ingredients whose recipes we'll need to further
8970   // process after constructing the initial VPlan.
8971   // ---------------------------------------------------------------------------
8972 
8973   // Mark instructions we'll need to sink later and their targets as
8974   // ingredients whose recipe we'll need to record.
8975   for (auto &Entry : SinkAfter) {
8976     RecipeBuilder.recordRecipeOf(Entry.first);
8977     RecipeBuilder.recordRecipeOf(Entry.second);
8978   }
8979   for (auto &Reduction : CM.getInLoopReductionChains()) {
8980     PHINode *Phi = Reduction.first;
8981     RecurKind Kind =
8982         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8983     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8984 
8985     RecipeBuilder.recordRecipeOf(Phi);
8986     for (auto &R : ReductionOperations) {
8987       RecipeBuilder.recordRecipeOf(R);
8988       // For min/max reducitons, where we have a pair of icmp/select, we also
8989       // need to record the ICmp recipe, so it can be removed later.
8990       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8991              "Only min/max recurrences allowed for inloop reductions");
8992       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8993         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8994     }
8995   }
8996 
8997   // For each interleave group which is relevant for this (possibly trimmed)
8998   // Range, add it to the set of groups to be later applied to the VPlan and add
8999   // placeholders for its members' Recipes which we'll be replacing with a
9000   // single VPInterleaveRecipe.
9001   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9002     auto applyIG = [IG, this](ElementCount VF) -> bool {
9003       return (VF.isVector() && // Query is illegal for VF == 1
9004               CM.getWideningDecision(IG->getInsertPos(), VF) ==
9005                   LoopVectorizationCostModel::CM_Interleave);
9006     };
9007     if (!getDecisionAndClampRange(applyIG, Range))
9008       continue;
9009     InterleaveGroups.insert(IG);
9010     for (unsigned i = 0; i < IG->getFactor(); i++)
9011       if (Instruction *Member = IG->getMember(i))
9012         RecipeBuilder.recordRecipeOf(Member);
9013   };
9014 
9015   // ---------------------------------------------------------------------------
9016   // Build initial VPlan: Scan the body of the loop in a topological order to
9017   // visit each basic block after having visited its predecessor basic blocks.
9018   // ---------------------------------------------------------------------------
9019 
9020   // Create initial VPlan skeleton, with separate header and latch blocks.
9021   VPBasicBlock *HeaderVPBB = new VPBasicBlock();
9022   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
9023   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
9024   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
9025   auto Plan = std::make_unique<VPlan>(TopRegion);
9026 
9027   // Scan the body of the loop in a topological order to visit each basic block
9028   // after having visited its predecessor basic blocks.
9029   LoopBlocksDFS DFS(OrigLoop);
9030   DFS.perform(LI);
9031 
9032   VPBasicBlock *VPBB = HeaderVPBB;
9033   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
9034   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9035     // Relevant instructions from basic block BB will be grouped into VPRecipe
9036     // ingredients and fill a new VPBasicBlock.
9037     unsigned VPBBsForBB = 0;
9038     VPBB->setName(BB->getName());
9039     Builder.setInsertPoint(VPBB);
9040 
9041     // Introduce each ingredient into VPlan.
9042     // TODO: Model and preserve debug instrinsics in VPlan.
9043     for (Instruction &I : BB->instructionsWithoutDebug()) {
9044       Instruction *Instr = &I;
9045 
9046       // First filter out irrelevant instructions, to ensure no recipes are
9047       // built for them.
9048       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9049         continue;
9050 
9051       SmallVector<VPValue *, 4> Operands;
9052       auto *Phi = dyn_cast<PHINode>(Instr);
9053       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9054         Operands.push_back(Plan->getOrAddVPValue(
9055             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9056       } else {
9057         auto OpRange = Plan->mapToVPValues(Instr->operands());
9058         Operands = {OpRange.begin(), OpRange.end()};
9059       }
9060       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9061               Instr, Operands, Range, Plan)) {
9062         // If Instr can be simplified to an existing VPValue, use it.
9063         if (RecipeOrValue.is<VPValue *>()) {
9064           auto *VPV = RecipeOrValue.get<VPValue *>();
9065           Plan->addVPValue(Instr, VPV);
9066           // If the re-used value is a recipe, register the recipe for the
9067           // instruction, in case the recipe for Instr needs to be recorded.
9068           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
9069             RecipeBuilder.setRecipe(Instr, R);
9070           continue;
9071         }
9072         // Otherwise, add the new recipe.
9073         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
9074         for (auto *Def : Recipe->definedValues()) {
9075           auto *UV = Def->getUnderlyingValue();
9076           Plan->addVPValue(UV, Def);
9077         }
9078 
9079         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
9080             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
9081           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
9082           // of the header block. That can happen for truncates of induction
9083           // variables. Those recipes are moved to the phi section of the header
9084           // block after applying SinkAfter, which relies on the original
9085           // position of the trunc.
9086           assert(isa<TruncInst>(Instr));
9087           InductionsToMove.push_back(
9088               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
9089         }
9090         RecipeBuilder.setRecipe(Instr, Recipe);
9091         VPBB->appendRecipe(Recipe);
9092         continue;
9093       }
9094 
9095       // Otherwise, if all widening options failed, Instruction is to be
9096       // replicated. This may create a successor for VPBB.
9097       VPBasicBlock *NextVPBB =
9098           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
9099       if (NextVPBB != VPBB) {
9100         VPBB = NextVPBB;
9101         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
9102                                     : "");
9103       }
9104     }
9105 
9106     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
9107     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9108   }
9109 
9110   // Fold the last, empty block into its predecessor.
9111   VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
9112   assert(VPBB && "expected to fold last (empty) block");
9113   // After here, VPBB should not be used.
9114   VPBB = nullptr;
9115 
9116   assert(isa<VPRegionBlock>(Plan->getEntry()) &&
9117          !Plan->getEntry()->getEntryBasicBlock()->empty() &&
9118          "entry block must be set to a VPRegionBlock having a non-empty entry "
9119          "VPBasicBlock");
9120   RecipeBuilder.fixHeaderPhis();
9121 
9122   // ---------------------------------------------------------------------------
9123   // Transform initial VPlan: Apply previously taken decisions, in order, to
9124   // bring the VPlan to its final state.
9125   // ---------------------------------------------------------------------------
9126 
9127   // Apply Sink-After legal constraints.
9128   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9129     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9130     if (Region && Region->isReplicator()) {
9131       assert(Region->getNumSuccessors() == 1 &&
9132              Region->getNumPredecessors() == 1 && "Expected SESE region!");
9133       assert(R->getParent()->size() == 1 &&
9134              "A recipe in an original replicator region must be the only "
9135              "recipe in its block");
9136       return Region;
9137     }
9138     return nullptr;
9139   };
9140   for (auto &Entry : SinkAfter) {
9141     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9142     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9143 
9144     auto *TargetRegion = GetReplicateRegion(Target);
9145     auto *SinkRegion = GetReplicateRegion(Sink);
9146     if (!SinkRegion) {
9147       // If the sink source is not a replicate region, sink the recipe directly.
9148       if (TargetRegion) {
9149         // The target is in a replication region, make sure to move Sink to
9150         // the block after it, not into the replication region itself.
9151         VPBasicBlock *NextBlock =
9152             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9153         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9154       } else
9155         Sink->moveAfter(Target);
9156       continue;
9157     }
9158 
9159     // The sink source is in a replicate region. Unhook the region from the CFG.
9160     auto *SinkPred = SinkRegion->getSinglePredecessor();
9161     auto *SinkSucc = SinkRegion->getSingleSuccessor();
9162     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9163     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9164     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9165 
9166     if (TargetRegion) {
9167       // The target recipe is also in a replicate region, move the sink region
9168       // after the target region.
9169       auto *TargetSucc = TargetRegion->getSingleSuccessor();
9170       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9171       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9172       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9173     } else {
9174       // The sink source is in a replicate region, we need to move the whole
9175       // replicate region, which should only contain a single recipe in the
9176       // main block.
9177       auto *SplitBlock =
9178           Target->getParent()->splitAt(std::next(Target->getIterator()));
9179 
9180       auto *SplitPred = SplitBlock->getSinglePredecessor();
9181 
9182       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9183       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9184       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9185     }
9186   }
9187 
9188   VPlanTransforms::removeRedundantInductionCasts(*Plan);
9189 
9190   // Now that sink-after is done, move induction recipes for optimized truncates
9191   // to the phi section of the header block.
9192   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
9193     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9194 
9195   // Adjust the recipes for any inloop reductions.
9196   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan,
9197                              RecipeBuilder, Range.Start);
9198 
9199   // Introduce a recipe to combine the incoming and previous values of a
9200   // first-order recurrence.
9201   for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9202     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9203     if (!RecurPhi)
9204       continue;
9205 
9206     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
9207     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9208     auto *Region = GetReplicateRegion(PrevRecipe);
9209     if (Region)
9210       InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
9211     if (Region || PrevRecipe->isPhi())
9212       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9213     else
9214       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9215 
9216     auto *RecurSplice = cast<VPInstruction>(
9217         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9218                              {RecurPhi, RecurPhi->getBackedgeValue()}));
9219 
9220     RecurPhi->replaceAllUsesWith(RecurSplice);
9221     // Set the first operand of RecurSplice to RecurPhi again, after replacing
9222     // all users.
9223     RecurSplice->setOperand(0, RecurPhi);
9224   }
9225 
9226   // Interleave memory: for each Interleave Group we marked earlier as relevant
9227   // for this VPlan, replace the Recipes widening its memory instructions with a
9228   // single VPInterleaveRecipe at its insertion point.
9229   for (auto IG : InterleaveGroups) {
9230     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9231         RecipeBuilder.getRecipe(IG->getInsertPos()));
9232     SmallVector<VPValue *, 4> StoredValues;
9233     for (unsigned i = 0; i < IG->getFactor(); ++i)
9234       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9235         auto *StoreR =
9236             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9237         StoredValues.push_back(StoreR->getStoredValue());
9238       }
9239 
9240     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9241                                         Recipe->getMask());
9242     VPIG->insertBefore(Recipe);
9243     unsigned J = 0;
9244     for (unsigned i = 0; i < IG->getFactor(); ++i)
9245       if (Instruction *Member = IG->getMember(i)) {
9246         if (!Member->getType()->isVoidTy()) {
9247           VPValue *OriginalV = Plan->getVPValue(Member);
9248           Plan->removeVPValueFor(Member);
9249           Plan->addVPValue(Member, VPIG->getVPValue(J));
9250           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9251           J++;
9252         }
9253         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9254       }
9255   }
9256 
9257   // From this point onwards, VPlan-to-VPlan transformations may change the plan
9258   // in ways that accessing values using original IR values is incorrect.
9259   Plan->disableValue2VPValue();
9260 
9261   VPlanTransforms::sinkScalarOperands(*Plan);
9262   VPlanTransforms::mergeReplicateRegions(*Plan);
9263 
9264   std::string PlanName;
9265   raw_string_ostream RSO(PlanName);
9266   ElementCount VF = Range.Start;
9267   Plan->addVF(VF);
9268   RSO << "Initial VPlan for VF={" << VF;
9269   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9270     Plan->addVF(VF);
9271     RSO << "," << VF;
9272   }
9273   RSO << "},UF>=1";
9274   RSO.flush();
9275   Plan->setName(PlanName);
9276 
9277   // Fold Exit block into its predecessor if possible.
9278   // TODO: Fold block earlier once all VPlan transforms properly maintain a
9279   // VPBasicBlock as exit.
9280   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
9281 
9282   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9283   return Plan;
9284 }
9285 
9286 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9287   // Outer loop handling: They may require CFG and instruction level
9288   // transformations before even evaluating whether vectorization is profitable.
9289   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9290   // the vectorization pipeline.
9291   assert(!OrigLoop->isInnermost());
9292   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9293 
9294   // Create new empty VPlan
9295   auto Plan = std::make_unique<VPlan>();
9296 
9297   // Build hierarchical CFG
9298   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9299   HCFGBuilder.buildHierarchicalCFG();
9300 
9301   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9302        VF *= 2)
9303     Plan->addVF(VF);
9304 
9305   if (EnableVPlanPredication) {
9306     VPlanPredicator VPP(*Plan);
9307     VPP.predicate();
9308 
9309     // Avoid running transformation to recipes until masked code generation in
9310     // VPlan-native path is in place.
9311     return Plan;
9312   }
9313 
9314   SmallPtrSet<Instruction *, 1> DeadInstructions;
9315   VPlanTransforms::VPInstructionsToVPRecipes(
9316       OrigLoop, Plan,
9317       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9318       DeadInstructions, *PSE.getSE());
9319   return Plan;
9320 }
9321 
9322 // Adjust the recipes for reductions. For in-loop reductions the chain of
9323 // instructions leading from the loop exit instr to the phi need to be converted
9324 // to reductions, with one operand being vector and the other being the scalar
9325 // reduction chain. For other reductions, a select is introduced between the phi
9326 // and live-out recipes when folding the tail.
9327 void LoopVectorizationPlanner::adjustRecipesForReductions(
9328     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9329     ElementCount MinVF) {
9330   for (auto &Reduction : CM.getInLoopReductionChains()) {
9331     PHINode *Phi = Reduction.first;
9332     const RecurrenceDescriptor &RdxDesc =
9333         Legal->getReductionVars().find(Phi)->second;
9334     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9335 
9336     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9337       continue;
9338 
9339     // ReductionOperations are orders top-down from the phi's use to the
9340     // LoopExitValue. We keep a track of the previous item (the Chain) to tell
9341     // which of the two operands will remain scalar and which will be reduced.
9342     // For minmax the chain will be the select instructions.
9343     Instruction *Chain = Phi;
9344     for (Instruction *R : ReductionOperations) {
9345       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9346       RecurKind Kind = RdxDesc.getRecurrenceKind();
9347 
9348       VPValue *ChainOp = Plan->getVPValue(Chain);
9349       unsigned FirstOpId;
9350       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9351              "Only min/max recurrences allowed for inloop reductions");
9352       // Recognize a call to the llvm.fmuladd intrinsic.
9353       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9354       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9355              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9356       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9357         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9358                "Expected to replace a VPWidenSelectSC");
9359         FirstOpId = 1;
9360       } else {
9361         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9362                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9363                "Expected to replace a VPWidenSC");
9364         FirstOpId = 0;
9365       }
9366       unsigned VecOpId =
9367           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9368       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9369 
9370       auto *CondOp = CM.foldTailByMasking()
9371                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9372                          : nullptr;
9373 
9374       if (IsFMulAdd) {
9375         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9376         // need to create an fmul recipe to use as the vector operand for the
9377         // fadd reduction.
9378         VPInstruction *FMulRecipe = new VPInstruction(
9379             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9380         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9381         WidenRecipe->getParent()->insert(FMulRecipe,
9382                                          WidenRecipe->getIterator());
9383         VecOp = FMulRecipe;
9384       }
9385       VPReductionRecipe *RedRecipe =
9386           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9387       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9388       Plan->removeVPValueFor(R);
9389       Plan->addVPValue(R, RedRecipe);
9390       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9391       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9392       WidenRecipe->eraseFromParent();
9393 
9394       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9395         VPRecipeBase *CompareRecipe =
9396             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9397         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9398                "Expected to replace a VPWidenSC");
9399         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9400                "Expected no remaining users");
9401         CompareRecipe->eraseFromParent();
9402       }
9403       Chain = R;
9404     }
9405   }
9406 
9407   // If tail is folded by masking, introduce selects between the phi
9408   // and the live-out instruction of each reduction, at the end of the latch.
9409   if (CM.foldTailByMasking()) {
9410     for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9411       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9412       if (!PhiR || PhiR->isInLoop())
9413         continue;
9414       Builder.setInsertPoint(LatchVPBB);
9415       VPValue *Cond =
9416           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9417       VPValue *Red = PhiR->getBackedgeValue();
9418       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9419     }
9420   }
9421 }
9422 
9423 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9424 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9425                                VPSlotTracker &SlotTracker) const {
9426   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9427   IG->getInsertPos()->printAsOperand(O, false);
9428   O << ", ";
9429   getAddr()->printAsOperand(O, SlotTracker);
9430   VPValue *Mask = getMask();
9431   if (Mask) {
9432     O << ", ";
9433     Mask->printAsOperand(O, SlotTracker);
9434   }
9435 
9436   unsigned OpIdx = 0;
9437   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9438     if (!IG->getMember(i))
9439       continue;
9440     if (getNumStoreOperands() > 0) {
9441       O << "\n" << Indent << "  store ";
9442       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9443       O << " to index " << i;
9444     } else {
9445       O << "\n" << Indent << "  ";
9446       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9447       O << " = load from index " << i;
9448     }
9449     ++OpIdx;
9450   }
9451 }
9452 #endif
9453 
9454 void VPWidenCallRecipe::execute(VPTransformState &State) {
9455   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9456                                   *this, State);
9457 }
9458 
9459 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9460   auto &I = *cast<SelectInst>(getUnderlyingInstr());
9461   State.ILV->setDebugLocFromInst(&I);
9462 
9463   // The condition can be loop invariant  but still defined inside the
9464   // loop. This means that we can't just use the original 'cond' value.
9465   // We have to take the 'vectorized' value and pick the first lane.
9466   // Instcombine will make this a no-op.
9467   auto *InvarCond =
9468       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9469 
9470   for (unsigned Part = 0; Part < State.UF; ++Part) {
9471     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9472     Value *Op0 = State.get(getOperand(1), Part);
9473     Value *Op1 = State.get(getOperand(2), Part);
9474     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9475     State.set(this, Sel, Part);
9476     State.ILV->addMetadata(Sel, &I);
9477   }
9478 }
9479 
9480 void VPWidenRecipe::execute(VPTransformState &State) {
9481   auto &I = *cast<Instruction>(getUnderlyingValue());
9482   auto &Builder = State.Builder;
9483   switch (I.getOpcode()) {
9484   case Instruction::Call:
9485   case Instruction::Br:
9486   case Instruction::PHI:
9487   case Instruction::GetElementPtr:
9488   case Instruction::Select:
9489     llvm_unreachable("This instruction is handled by a different recipe.");
9490   case Instruction::UDiv:
9491   case Instruction::SDiv:
9492   case Instruction::SRem:
9493   case Instruction::URem:
9494   case Instruction::Add:
9495   case Instruction::FAdd:
9496   case Instruction::Sub:
9497   case Instruction::FSub:
9498   case Instruction::FNeg:
9499   case Instruction::Mul:
9500   case Instruction::FMul:
9501   case Instruction::FDiv:
9502   case Instruction::FRem:
9503   case Instruction::Shl:
9504   case Instruction::LShr:
9505   case Instruction::AShr:
9506   case Instruction::And:
9507   case Instruction::Or:
9508   case Instruction::Xor: {
9509     // Just widen unops and binops.
9510     State.ILV->setDebugLocFromInst(&I);
9511 
9512     for (unsigned Part = 0; Part < State.UF; ++Part) {
9513       SmallVector<Value *, 2> Ops;
9514       for (VPValue *VPOp : operands())
9515         Ops.push_back(State.get(VPOp, Part));
9516 
9517       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
9518 
9519       if (auto *VecOp = dyn_cast<Instruction>(V)) {
9520         VecOp->copyIRFlags(&I);
9521 
9522         // If the instruction is vectorized and was in a basic block that needed
9523         // predication, we can't propagate poison-generating flags (nuw/nsw,
9524         // exact, etc.). The control flow has been linearized and the
9525         // instruction is no longer guarded by the predicate, which could make
9526         // the flag properties to no longer hold.
9527         if (State.MayGeneratePoisonRecipes.contains(this))
9528           VecOp->dropPoisonGeneratingFlags();
9529       }
9530 
9531       // Use this vector value for all users of the original instruction.
9532       State.set(this, V, Part);
9533       State.ILV->addMetadata(V, &I);
9534     }
9535 
9536     break;
9537   }
9538   case Instruction::ICmp:
9539   case Instruction::FCmp: {
9540     // Widen compares. Generate vector compares.
9541     bool FCmp = (I.getOpcode() == Instruction::FCmp);
9542     auto *Cmp = cast<CmpInst>(&I);
9543     State.ILV->setDebugLocFromInst(Cmp);
9544     for (unsigned Part = 0; Part < State.UF; ++Part) {
9545       Value *A = State.get(getOperand(0), Part);
9546       Value *B = State.get(getOperand(1), Part);
9547       Value *C = nullptr;
9548       if (FCmp) {
9549         // Propagate fast math flags.
9550         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9551         Builder.setFastMathFlags(Cmp->getFastMathFlags());
9552         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
9553       } else {
9554         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
9555       }
9556       State.set(this, C, Part);
9557       State.ILV->addMetadata(C, &I);
9558     }
9559 
9560     break;
9561   }
9562 
9563   case Instruction::ZExt:
9564   case Instruction::SExt:
9565   case Instruction::FPToUI:
9566   case Instruction::FPToSI:
9567   case Instruction::FPExt:
9568   case Instruction::PtrToInt:
9569   case Instruction::IntToPtr:
9570   case Instruction::SIToFP:
9571   case Instruction::UIToFP:
9572   case Instruction::Trunc:
9573   case Instruction::FPTrunc:
9574   case Instruction::BitCast: {
9575     auto *CI = cast<CastInst>(&I);
9576     State.ILV->setDebugLocFromInst(CI);
9577 
9578     /// Vectorize casts.
9579     Type *DestTy = (State.VF.isScalar())
9580                        ? CI->getType()
9581                        : VectorType::get(CI->getType(), State.VF);
9582 
9583     for (unsigned Part = 0; Part < State.UF; ++Part) {
9584       Value *A = State.get(getOperand(0), Part);
9585       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
9586       State.set(this, Cast, Part);
9587       State.ILV->addMetadata(Cast, &I);
9588     }
9589     break;
9590   }
9591   default:
9592     // This instruction is not vectorized by simple widening.
9593     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
9594     llvm_unreachable("Unhandled instruction!");
9595   } // end of switch.
9596 }
9597 
9598 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9599   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
9600   // Construct a vector GEP by widening the operands of the scalar GEP as
9601   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
9602   // results in a vector of pointers when at least one operand of the GEP
9603   // is vector-typed. Thus, to keep the representation compact, we only use
9604   // vector-typed operands for loop-varying values.
9605 
9606   if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
9607     // If we are vectorizing, but the GEP has only loop-invariant operands,
9608     // the GEP we build (by only using vector-typed operands for
9609     // loop-varying values) would be a scalar pointer. Thus, to ensure we
9610     // produce a vector of pointers, we need to either arbitrarily pick an
9611     // operand to broadcast, or broadcast a clone of the original GEP.
9612     // Here, we broadcast a clone of the original.
9613     //
9614     // TODO: If at some point we decide to scalarize instructions having
9615     //       loop-invariant operands, this special case will no longer be
9616     //       required. We would add the scalarization decision to
9617     //       collectLoopScalars() and teach getVectorValue() to broadcast
9618     //       the lane-zero scalar value.
9619     auto *Clone = State.Builder.Insert(GEP->clone());
9620     for (unsigned Part = 0; Part < State.UF; ++Part) {
9621       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
9622       State.set(this, EntryPart, Part);
9623       State.ILV->addMetadata(EntryPart, GEP);
9624     }
9625   } else {
9626     // If the GEP has at least one loop-varying operand, we are sure to
9627     // produce a vector of pointers. But if we are only unrolling, we want
9628     // to produce a scalar GEP for each unroll part. Thus, the GEP we
9629     // produce with the code below will be scalar (if VF == 1) or vector
9630     // (otherwise). Note that for the unroll-only case, we still maintain
9631     // values in the vector mapping with initVector, as we do for other
9632     // instructions.
9633     for (unsigned Part = 0; Part < State.UF; ++Part) {
9634       // The pointer operand of the new GEP. If it's loop-invariant, we
9635       // won't broadcast it.
9636       auto *Ptr = IsPtrLoopInvariant
9637                       ? State.get(getOperand(0), VPIteration(0, 0))
9638                       : State.get(getOperand(0), Part);
9639 
9640       // Collect all the indices for the new GEP. If any index is
9641       // loop-invariant, we won't broadcast it.
9642       SmallVector<Value *, 4> Indices;
9643       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9644         VPValue *Operand = getOperand(I);
9645         if (IsIndexLoopInvariant[I - 1])
9646           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9647         else
9648           Indices.push_back(State.get(Operand, Part));
9649       }
9650 
9651       // If the GEP instruction is vectorized and was in a basic block that
9652       // needed predication, we can't propagate the poison-generating 'inbounds'
9653       // flag. The control flow has been linearized and the GEP is no longer
9654       // guarded by the predicate, which could make the 'inbounds' properties to
9655       // no longer hold.
9656       bool IsInBounds =
9657           GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9658 
9659       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
9660       // but it should be a vector, otherwise.
9661       auto *NewGEP = IsInBounds
9662                          ? State.Builder.CreateInBoundsGEP(
9663                                GEP->getSourceElementType(), Ptr, Indices)
9664                          : State.Builder.CreateGEP(GEP->getSourceElementType(),
9665                                                    Ptr, Indices);
9666       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9667              "NewGEP is not a pointer vector");
9668       State.set(this, NewGEP, Part);
9669       State.ILV->addMetadata(NewGEP, GEP);
9670     }
9671   }
9672 }
9673 
9674 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9675   assert(!State.Instance && "Int or FP induction being replicated.");
9676   State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
9677                                    getStartValue()->getLiveInIRValue(),
9678                                    getTruncInst(), getVPValue(0), State);
9679 }
9680 
9681 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9682   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
9683                                  State);
9684 }
9685 
9686 void VPBlendRecipe::execute(VPTransformState &State) {
9687   State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9688   // We know that all PHIs in non-header blocks are converted into
9689   // selects, so we don't have to worry about the insertion order and we
9690   // can just use the builder.
9691   // At this point we generate the predication tree. There may be
9692   // duplications since this is a simple recursive scan, but future
9693   // optimizations will clean it up.
9694 
9695   unsigned NumIncoming = getNumIncomingValues();
9696 
9697   // Generate a sequence of selects of the form:
9698   // SELECT(Mask3, In3,
9699   //        SELECT(Mask2, In2,
9700   //               SELECT(Mask1, In1,
9701   //                      In0)))
9702   // Note that Mask0 is never used: lanes for which no path reaches this phi and
9703   // are essentially undef are taken from In0.
9704   InnerLoopVectorizer::VectorParts Entry(State.UF);
9705   for (unsigned In = 0; In < NumIncoming; ++In) {
9706     for (unsigned Part = 0; Part < State.UF; ++Part) {
9707       // We might have single edge PHIs (blocks) - use an identity
9708       // 'select' for the first PHI operand.
9709       Value *In0 = State.get(getIncomingValue(In), Part);
9710       if (In == 0)
9711         Entry[Part] = In0; // Initialize with the first incoming value.
9712       else {
9713         // Select between the current value and the previous incoming edge
9714         // based on the incoming mask.
9715         Value *Cond = State.get(getMask(In), Part);
9716         Entry[Part] =
9717             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9718       }
9719     }
9720   }
9721   for (unsigned Part = 0; Part < State.UF; ++Part)
9722     State.set(this, Entry[Part], Part);
9723 }
9724 
9725 void VPInterleaveRecipe::execute(VPTransformState &State) {
9726   assert(!State.Instance && "Interleave group being replicated.");
9727   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9728                                       getStoredValues(), getMask());
9729 }
9730 
9731 void VPReductionRecipe::execute(VPTransformState &State) {
9732   assert(!State.Instance && "Reduction being replicated.");
9733   Value *PrevInChain = State.get(getChainOp(), 0);
9734   RecurKind Kind = RdxDesc->getRecurrenceKind();
9735   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9736   // Propagate the fast-math flags carried by the underlying instruction.
9737   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9738   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9739   for (unsigned Part = 0; Part < State.UF; ++Part) {
9740     Value *NewVecOp = State.get(getVecOp(), Part);
9741     if (VPValue *Cond = getCondOp()) {
9742       Value *NewCond = State.get(Cond, Part);
9743       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9744       Value *Iden = RdxDesc->getRecurrenceIdentity(
9745           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9746       Value *IdenVec =
9747           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9748       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9749       NewVecOp = Select;
9750     }
9751     Value *NewRed;
9752     Value *NextInChain;
9753     if (IsOrdered) {
9754       if (State.VF.isVector())
9755         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9756                                         PrevInChain);
9757       else
9758         NewRed = State.Builder.CreateBinOp(
9759             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9760             NewVecOp);
9761       PrevInChain = NewRed;
9762     } else {
9763       PrevInChain = State.get(getChainOp(), Part);
9764       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9765     }
9766     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9767       NextInChain =
9768           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9769                          NewRed, PrevInChain);
9770     } else if (IsOrdered)
9771       NextInChain = NewRed;
9772     else
9773       NextInChain = State.Builder.CreateBinOp(
9774           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9775           PrevInChain);
9776     State.set(this, NextInChain, Part);
9777   }
9778 }
9779 
9780 void VPReplicateRecipe::execute(VPTransformState &State) {
9781   if (State.Instance) { // Generate a single instance.
9782     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9783     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9784                                     IsPredicated, State);
9785     // Insert scalar instance packing it into a vector.
9786     if (AlsoPack && State.VF.isVector()) {
9787       // If we're constructing lane 0, initialize to start from poison.
9788       if (State.Instance->Lane.isFirstLane()) {
9789         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9790         Value *Poison = PoisonValue::get(
9791             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9792         State.set(this, Poison, State.Instance->Part);
9793       }
9794       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9795     }
9796     return;
9797   }
9798 
9799   // Generate scalar instances for all VF lanes of all UF parts, unless the
9800   // instruction is uniform inwhich case generate only the first lane for each
9801   // of the UF parts.
9802   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9803   assert((!State.VF.isScalable() || IsUniform) &&
9804          "Can't scalarize a scalable vector");
9805   for (unsigned Part = 0; Part < State.UF; ++Part)
9806     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9807       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9808                                       VPIteration(Part, Lane), IsPredicated,
9809                                       State);
9810 }
9811 
9812 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9813   assert(State.Instance && "Branch on Mask works only on single instance.");
9814 
9815   unsigned Part = State.Instance->Part;
9816   unsigned Lane = State.Instance->Lane.getKnownLane();
9817 
9818   Value *ConditionBit = nullptr;
9819   VPValue *BlockInMask = getMask();
9820   if (BlockInMask) {
9821     ConditionBit = State.get(BlockInMask, Part);
9822     if (ConditionBit->getType()->isVectorTy())
9823       ConditionBit = State.Builder.CreateExtractElement(
9824           ConditionBit, State.Builder.getInt32(Lane));
9825   } else // Block in mask is all-one.
9826     ConditionBit = State.Builder.getTrue();
9827 
9828   // Replace the temporary unreachable terminator with a new conditional branch,
9829   // whose two destinations will be set later when they are created.
9830   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9831   assert(isa<UnreachableInst>(CurrentTerminator) &&
9832          "Expected to replace unreachable terminator with conditional branch.");
9833   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9834   CondBr->setSuccessor(0, nullptr);
9835   ReplaceInstWithInst(CurrentTerminator, CondBr);
9836 }
9837 
9838 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9839   assert(State.Instance && "Predicated instruction PHI works per instance.");
9840   Instruction *ScalarPredInst =
9841       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9842   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9843   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9844   assert(PredicatingBB && "Predicated block has no single predecessor.");
9845   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9846          "operand must be VPReplicateRecipe");
9847 
9848   // By current pack/unpack logic we need to generate only a single phi node: if
9849   // a vector value for the predicated instruction exists at this point it means
9850   // the instruction has vector users only, and a phi for the vector value is
9851   // needed. In this case the recipe of the predicated instruction is marked to
9852   // also do that packing, thereby "hoisting" the insert-element sequence.
9853   // Otherwise, a phi node for the scalar value is needed.
9854   unsigned Part = State.Instance->Part;
9855   if (State.hasVectorValue(getOperand(0), Part)) {
9856     Value *VectorValue = State.get(getOperand(0), Part);
9857     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9858     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9859     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9860     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9861     if (State.hasVectorValue(this, Part))
9862       State.reset(this, VPhi, Part);
9863     else
9864       State.set(this, VPhi, Part);
9865     // NOTE: Currently we need to update the value of the operand, so the next
9866     // predicated iteration inserts its generated value in the correct vector.
9867     State.reset(getOperand(0), VPhi, Part);
9868   } else {
9869     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9870     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9871     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9872                      PredicatingBB);
9873     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9874     if (State.hasScalarValue(this, *State.Instance))
9875       State.reset(this, Phi, *State.Instance);
9876     else
9877       State.set(this, Phi, *State.Instance);
9878     // NOTE: Currently we need to update the value of the operand, so the next
9879     // predicated iteration inserts its generated value in the correct vector.
9880     State.reset(getOperand(0), Phi, *State.Instance);
9881   }
9882 }
9883 
9884 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9885   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9886 
9887   // Attempt to issue a wide load.
9888   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9889   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9890 
9891   assert((LI || SI) && "Invalid Load/Store instruction");
9892   assert((!SI || StoredValue) && "No stored value provided for widened store");
9893   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9894 
9895   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9896 
9897   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9898   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9899   bool CreateGatherScatter = !Consecutive;
9900 
9901   auto &Builder = State.Builder;
9902   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9903   bool isMaskRequired = getMask();
9904   if (isMaskRequired)
9905     for (unsigned Part = 0; Part < State.UF; ++Part)
9906       BlockInMaskParts[Part] = State.get(getMask(), Part);
9907 
9908   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9909     // Calculate the pointer for the specific unroll-part.
9910     GetElementPtrInst *PartPtr = nullptr;
9911 
9912     bool InBounds = false;
9913     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9914       InBounds = gep->isInBounds();
9915     if (Reverse) {
9916       // If the address is consecutive but reversed, then the
9917       // wide store needs to start at the last vector element.
9918       // RunTimeVF =  VScale * VF.getKnownMinValue()
9919       // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
9920       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9921       // NumElt = -Part * RunTimeVF
9922       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9923       // LastLane = 1 - RunTimeVF
9924       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
9925       PartPtr =
9926           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9927       PartPtr->setIsInBounds(InBounds);
9928       PartPtr = cast<GetElementPtrInst>(
9929           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9930       PartPtr->setIsInBounds(InBounds);
9931       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9932         BlockInMaskParts[Part] =
9933             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9934     } else {
9935       Value *Increment =
9936           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9937       PartPtr = cast<GetElementPtrInst>(
9938           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9939       PartPtr->setIsInBounds(InBounds);
9940     }
9941 
9942     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9943     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9944   };
9945 
9946   // Handle Stores:
9947   if (SI) {
9948     State.ILV->setDebugLocFromInst(SI);
9949 
9950     for (unsigned Part = 0; Part < State.UF; ++Part) {
9951       Instruction *NewSI = nullptr;
9952       Value *StoredVal = State.get(StoredValue, Part);
9953       if (CreateGatherScatter) {
9954         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9955         Value *VectorGep = State.get(getAddr(), Part);
9956         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9957                                             MaskPart);
9958       } else {
9959         if (Reverse) {
9960           // If we store to reverse consecutive memory locations, then we need
9961           // to reverse the order of elements in the stored value.
9962           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9963           // We don't want to update the value in the map as it might be used in
9964           // another expression. So don't call resetVectorValue(StoredVal).
9965         }
9966         auto *VecPtr =
9967             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9968         if (isMaskRequired)
9969           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9970                                             BlockInMaskParts[Part]);
9971         else
9972           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9973       }
9974       State.ILV->addMetadata(NewSI, SI);
9975     }
9976     return;
9977   }
9978 
9979   // Handle loads.
9980   assert(LI && "Must have a load instruction");
9981   State.ILV->setDebugLocFromInst(LI);
9982   for (unsigned Part = 0; Part < State.UF; ++Part) {
9983     Value *NewLI;
9984     if (CreateGatherScatter) {
9985       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9986       Value *VectorGep = State.get(getAddr(), Part);
9987       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9988                                          nullptr, "wide.masked.gather");
9989       State.ILV->addMetadata(NewLI, LI);
9990     } else {
9991       auto *VecPtr =
9992           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9993       if (isMaskRequired)
9994         NewLI = Builder.CreateMaskedLoad(
9995             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9996             PoisonValue::get(DataTy), "wide.masked.load");
9997       else
9998         NewLI =
9999             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
10000 
10001       // Add metadata to the load, but setVectorValue to the reverse shuffle.
10002       State.ILV->addMetadata(NewLI, LI);
10003       if (Reverse)
10004         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
10005     }
10006 
10007     State.set(getVPSingleValue(), NewLI, Part);
10008   }
10009 }
10010 
10011 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10012 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10013 // predication, and 4) a TTI hook that analyses whether the loop is suitable
10014 // for predication.
10015 static ScalarEpilogueLowering getScalarEpilogueLowering(
10016     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10017     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10018     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
10019     LoopVectorizationLegality &LVL) {
10020   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10021   // don't look at hints or options, and don't request a scalar epilogue.
10022   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10023   // LoopAccessInfo (due to code dependency and not being able to reliably get
10024   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10025   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10026   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10027   // back to the old way and vectorize with versioning when forced. See D81345.)
10028   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10029                                                       PGSOQueryType::IRPass) &&
10030                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10031     return CM_ScalarEpilogueNotAllowedOptSize;
10032 
10033   // 2) If set, obey the directives
10034   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10035     switch (PreferPredicateOverEpilogue) {
10036     case PreferPredicateTy::ScalarEpilogue:
10037       return CM_ScalarEpilogueAllowed;
10038     case PreferPredicateTy::PredicateElseScalarEpilogue:
10039       return CM_ScalarEpilogueNotNeededUsePredicate;
10040     case PreferPredicateTy::PredicateOrDontVectorize:
10041       return CM_ScalarEpilogueNotAllowedUsePredicate;
10042     };
10043   }
10044 
10045   // 3) If set, obey the hints
10046   switch (Hints.getPredicate()) {
10047   case LoopVectorizeHints::FK_Enabled:
10048     return CM_ScalarEpilogueNotNeededUsePredicate;
10049   case LoopVectorizeHints::FK_Disabled:
10050     return CM_ScalarEpilogueAllowed;
10051   };
10052 
10053   // 4) if the TTI hook indicates this is profitable, request predication.
10054   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10055                                        LVL.getLAI()))
10056     return CM_ScalarEpilogueNotNeededUsePredicate;
10057 
10058   return CM_ScalarEpilogueAllowed;
10059 }
10060 
10061 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
10062   // If Values have been set for this Def return the one relevant for \p Part.
10063   if (hasVectorValue(Def, Part))
10064     return Data.PerPartOutput[Def][Part];
10065 
10066   if (!hasScalarValue(Def, {Part, 0})) {
10067     Value *IRV = Def->getLiveInIRValue();
10068     Value *B = ILV->getBroadcastInstrs(IRV);
10069     set(Def, B, Part);
10070     return B;
10071   }
10072 
10073   Value *ScalarValue = get(Def, {Part, 0});
10074   // If we aren't vectorizing, we can just copy the scalar map values over
10075   // to the vector map.
10076   if (VF.isScalar()) {
10077     set(Def, ScalarValue, Part);
10078     return ScalarValue;
10079   }
10080 
10081   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
10082   bool IsUniform = RepR && RepR->isUniform();
10083 
10084   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
10085   // Check if there is a scalar value for the selected lane.
10086   if (!hasScalarValue(Def, {Part, LastLane})) {
10087     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
10088     assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
10089            "unexpected recipe found to be invariant");
10090     IsUniform = true;
10091     LastLane = 0;
10092   }
10093 
10094   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10095   // Set the insert point after the last scalarized instruction or after the
10096   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10097   // will directly follow the scalar definitions.
10098   auto OldIP = Builder.saveIP();
10099   auto NewIP =
10100       isa<PHINode>(LastInst)
10101           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10102           : std::next(BasicBlock::iterator(LastInst));
10103   Builder.SetInsertPoint(&*NewIP);
10104 
10105   // However, if we are vectorizing, we need to construct the vector values.
10106   // If the value is known to be uniform after vectorization, we can just
10107   // broadcast the scalar value corresponding to lane zero for each unroll
10108   // iteration. Otherwise, we construct the vector values using
10109   // insertelement instructions. Since the resulting vectors are stored in
10110   // State, we will only generate the insertelements once.
10111   Value *VectorValue = nullptr;
10112   if (IsUniform) {
10113     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10114     set(Def, VectorValue, Part);
10115   } else {
10116     // Initialize packing with insertelements to start from undef.
10117     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
10118     Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
10119     set(Def, Undef, Part);
10120     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10121       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10122     VectorValue = get(Def, Part);
10123   }
10124   Builder.restoreIP(OldIP);
10125   return VectorValue;
10126 }
10127 
10128 // Process the loop in the VPlan-native vectorization path. This path builds
10129 // VPlan upfront in the vectorization pipeline, which allows to apply
10130 // VPlan-to-VPlan transformations from the very beginning without modifying the
10131 // input LLVM IR.
10132 static bool processLoopInVPlanNativePath(
10133     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10134     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10135     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10136     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10137     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10138     LoopVectorizationRequirements &Requirements) {
10139 
10140   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10141     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10142     return false;
10143   }
10144   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10145   Function *F = L->getHeader()->getParent();
10146   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10147 
10148   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10149       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10150 
10151   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10152                                 &Hints, IAI);
10153   // Use the planner for outer loop vectorization.
10154   // TODO: CM is not used at this point inside the planner. Turn CM into an
10155   // optional argument if we don't need it in the future.
10156   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10157                                Requirements, ORE);
10158 
10159   // Get user vectorization factor.
10160   ElementCount UserVF = Hints.getWidth();
10161 
10162   CM.collectElementTypesForWidening();
10163 
10164   // Plan how to best vectorize, return the best VF and its cost.
10165   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10166 
10167   // If we are stress testing VPlan builds, do not attempt to generate vector
10168   // code. Masked vector code generation support will follow soon.
10169   // Also, do not attempt to vectorize if no vector code will be produced.
10170   if (VPlanBuildStressTest || EnableVPlanPredication ||
10171       VectorizationFactor::Disabled() == VF)
10172     return false;
10173 
10174   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10175 
10176   {
10177     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10178                              F->getParent()->getDataLayout());
10179     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10180                            &CM, BFI, PSI, Checks);
10181     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10182                       << L->getHeader()->getParent()->getName() << "\"\n");
10183     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10184   }
10185 
10186   // Mark the loop as already vectorized to avoid vectorizing again.
10187   Hints.setAlreadyVectorized();
10188   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10189   return true;
10190 }
10191 
10192 // Emit a remark if there are stores to floats that required a floating point
10193 // extension. If the vectorized loop was generated with floating point there
10194 // will be a performance penalty from the conversion overhead and the change in
10195 // the vector width.
10196 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10197   SmallVector<Instruction *, 4> Worklist;
10198   for (BasicBlock *BB : L->getBlocks()) {
10199     for (Instruction &Inst : *BB) {
10200       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10201         if (S->getValueOperand()->getType()->isFloatTy())
10202           Worklist.push_back(S);
10203       }
10204     }
10205   }
10206 
10207   // Traverse the floating point stores upwards searching, for floating point
10208   // conversions.
10209   SmallPtrSet<const Instruction *, 4> Visited;
10210   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10211   while (!Worklist.empty()) {
10212     auto *I = Worklist.pop_back_val();
10213     if (!L->contains(I))
10214       continue;
10215     if (!Visited.insert(I).second)
10216       continue;
10217 
10218     // Emit a remark if the floating point store required a floating
10219     // point conversion.
10220     // TODO: More work could be done to identify the root cause such as a
10221     // constant or a function return type and point the user to it.
10222     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10223       ORE->emit([&]() {
10224         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10225                                           I->getDebugLoc(), L->getHeader())
10226                << "floating point conversion changes vector width. "
10227                << "Mixed floating point precision requires an up/down "
10228                << "cast that will negatively impact performance.";
10229       });
10230 
10231     for (Use &Op : I->operands())
10232       if (auto *OpI = dyn_cast<Instruction>(Op))
10233         Worklist.push_back(OpI);
10234   }
10235 }
10236 
10237 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10238     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10239                                !EnableLoopInterleaving),
10240       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10241                               !EnableLoopVectorization) {}
10242 
10243 bool LoopVectorizePass::processLoop(Loop *L) {
10244   assert((EnableVPlanNativePath || L->isInnermost()) &&
10245          "VPlan-native path is not enabled. Only process inner loops.");
10246 
10247 #ifndef NDEBUG
10248   const std::string DebugLocStr = getDebugLocString(L);
10249 #endif /* NDEBUG */
10250 
10251   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
10252                     << L->getHeader()->getParent()->getName() << "\" from "
10253                     << DebugLocStr << "\n");
10254 
10255   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10256 
10257   LLVM_DEBUG(
10258       dbgs() << "LV: Loop hints:"
10259              << " force="
10260              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10261                      ? "disabled"
10262                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10263                             ? "enabled"
10264                             : "?"))
10265              << " width=" << Hints.getWidth()
10266              << " interleave=" << Hints.getInterleave() << "\n");
10267 
10268   // Function containing loop
10269   Function *F = L->getHeader()->getParent();
10270 
10271   // Looking at the diagnostic output is the only way to determine if a loop
10272   // was vectorized (other than looking at the IR or machine code), so it
10273   // is important to generate an optimization remark for each loop. Most of
10274   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10275   // generated as OptimizationRemark and OptimizationRemarkMissed are
10276   // less verbose reporting vectorized loops and unvectorized loops that may
10277   // benefit from vectorization, respectively.
10278 
10279   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10280     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10281     return false;
10282   }
10283 
10284   PredicatedScalarEvolution PSE(*SE, *L);
10285 
10286   // Check if it is legal to vectorize the loop.
10287   LoopVectorizationRequirements Requirements;
10288   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10289                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10290   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10291     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10292     Hints.emitRemarkWithHints();
10293     return false;
10294   }
10295 
10296   // Check the function attributes and profiles to find out if this function
10297   // should be optimized for size.
10298   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10299       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10300 
10301   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10302   // here. They may require CFG and instruction level transformations before
10303   // even evaluating whether vectorization is profitable. Since we cannot modify
10304   // the incoming IR, we need to build VPlan upfront in the vectorization
10305   // pipeline.
10306   if (!L->isInnermost())
10307     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10308                                         ORE, BFI, PSI, Hints, Requirements);
10309 
10310   assert(L->isInnermost() && "Inner loop expected.");
10311 
10312   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10313   // count by optimizing for size, to minimize overheads.
10314   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10315   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10316     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10317                       << "This loop is worth vectorizing only if no scalar "
10318                       << "iteration overheads are incurred.");
10319     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10320       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10321     else {
10322       LLVM_DEBUG(dbgs() << "\n");
10323       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10324     }
10325   }
10326 
10327   // Check the function attributes to see if implicit floats are allowed.
10328   // FIXME: This check doesn't seem possibly correct -- what if the loop is
10329   // an integer loop and the vector instructions selected are purely integer
10330   // vector instructions?
10331   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10332     reportVectorizationFailure(
10333         "Can't vectorize when the NoImplicitFloat attribute is used",
10334         "loop not vectorized due to NoImplicitFloat attribute",
10335         "NoImplicitFloat", ORE, L);
10336     Hints.emitRemarkWithHints();
10337     return false;
10338   }
10339 
10340   // Check if the target supports potentially unsafe FP vectorization.
10341   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10342   // for the target we're vectorizing for, to make sure none of the
10343   // additional fp-math flags can help.
10344   if (Hints.isPotentiallyUnsafe() &&
10345       TTI->isFPVectorizationPotentiallyUnsafe()) {
10346     reportVectorizationFailure(
10347         "Potentially unsafe FP op prevents vectorization",
10348         "loop not vectorized due to unsafe FP support.",
10349         "UnsafeFP", ORE, L);
10350     Hints.emitRemarkWithHints();
10351     return false;
10352   }
10353 
10354   bool AllowOrderedReductions;
10355   // If the flag is set, use that instead and override the TTI behaviour.
10356   if (ForceOrderedReductions.getNumOccurrences() > 0)
10357     AllowOrderedReductions = ForceOrderedReductions;
10358   else
10359     AllowOrderedReductions = TTI->enableOrderedReductions();
10360   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10361     ORE->emit([&]() {
10362       auto *ExactFPMathInst = Requirements.getExactFPInst();
10363       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10364                                                  ExactFPMathInst->getDebugLoc(),
10365                                                  ExactFPMathInst->getParent())
10366              << "loop not vectorized: cannot prove it is safe to reorder "
10367                 "floating-point operations";
10368     });
10369     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10370                          "reorder floating-point operations\n");
10371     Hints.emitRemarkWithHints();
10372     return false;
10373   }
10374 
10375   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10376   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10377 
10378   // If an override option has been passed in for interleaved accesses, use it.
10379   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10380     UseInterleaved = EnableInterleavedMemAccesses;
10381 
10382   // Analyze interleaved memory accesses.
10383   if (UseInterleaved) {
10384     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10385   }
10386 
10387   // Use the cost model.
10388   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10389                                 F, &Hints, IAI);
10390   CM.collectValuesToIgnore();
10391   CM.collectElementTypesForWidening();
10392 
10393   // Use the planner for vectorization.
10394   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10395                                Requirements, ORE);
10396 
10397   // Get user vectorization factor and interleave count.
10398   ElementCount UserVF = Hints.getWidth();
10399   unsigned UserIC = Hints.getInterleave();
10400 
10401   // Plan how to best vectorize, return the best VF and its cost.
10402   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10403 
10404   VectorizationFactor VF = VectorizationFactor::Disabled();
10405   unsigned IC = 1;
10406 
10407   if (MaybeVF) {
10408     VF = *MaybeVF;
10409     // Select the interleave count.
10410     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10411   }
10412 
10413   // Identify the diagnostic messages that should be produced.
10414   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10415   bool VectorizeLoop = true, InterleaveLoop = true;
10416   if (VF.Width.isScalar()) {
10417     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10418     VecDiagMsg = std::make_pair(
10419         "VectorizationNotBeneficial",
10420         "the cost-model indicates that vectorization is not beneficial");
10421     VectorizeLoop = false;
10422   }
10423 
10424   if (!MaybeVF && UserIC > 1) {
10425     // Tell the user interleaving was avoided up-front, despite being explicitly
10426     // requested.
10427     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10428                          "interleaving should be avoided up front\n");
10429     IntDiagMsg = std::make_pair(
10430         "InterleavingAvoided",
10431         "Ignoring UserIC, because interleaving was avoided up front");
10432     InterleaveLoop = false;
10433   } else if (IC == 1 && UserIC <= 1) {
10434     // Tell the user interleaving is not beneficial.
10435     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10436     IntDiagMsg = std::make_pair(
10437         "InterleavingNotBeneficial",
10438         "the cost-model indicates that interleaving is not beneficial");
10439     InterleaveLoop = false;
10440     if (UserIC == 1) {
10441       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10442       IntDiagMsg.second +=
10443           " and is explicitly disabled or interleave count is set to 1";
10444     }
10445   } else if (IC > 1 && UserIC == 1) {
10446     // Tell the user interleaving is beneficial, but it explicitly disabled.
10447     LLVM_DEBUG(
10448         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10449     IntDiagMsg = std::make_pair(
10450         "InterleavingBeneficialButDisabled",
10451         "the cost-model indicates that interleaving is beneficial "
10452         "but is explicitly disabled or interleave count is set to 1");
10453     InterleaveLoop = false;
10454   }
10455 
10456   // Override IC if user provided an interleave count.
10457   IC = UserIC > 0 ? UserIC : IC;
10458 
10459   // Emit diagnostic messages, if any.
10460   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10461   if (!VectorizeLoop && !InterleaveLoop) {
10462     // Do not vectorize or interleaving the loop.
10463     ORE->emit([&]() {
10464       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10465                                       L->getStartLoc(), L->getHeader())
10466              << VecDiagMsg.second;
10467     });
10468     ORE->emit([&]() {
10469       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10470                                       L->getStartLoc(), L->getHeader())
10471              << IntDiagMsg.second;
10472     });
10473     return false;
10474   } else if (!VectorizeLoop && InterleaveLoop) {
10475     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10476     ORE->emit([&]() {
10477       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10478                                         L->getStartLoc(), L->getHeader())
10479              << VecDiagMsg.second;
10480     });
10481   } else if (VectorizeLoop && !InterleaveLoop) {
10482     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10483                       << ") in " << DebugLocStr << '\n');
10484     ORE->emit([&]() {
10485       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10486                                         L->getStartLoc(), L->getHeader())
10487              << IntDiagMsg.second;
10488     });
10489   } else if (VectorizeLoop && InterleaveLoop) {
10490     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10491                       << ") in " << DebugLocStr << '\n');
10492     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10493   }
10494 
10495   bool DisableRuntimeUnroll = false;
10496   MDNode *OrigLoopID = L->getLoopID();
10497   {
10498     // Optimistically generate runtime checks. Drop them if they turn out to not
10499     // be profitable. Limit the scope of Checks, so the cleanup happens
10500     // immediately after vector codegeneration is done.
10501     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10502                              F->getParent()->getDataLayout());
10503     if (!VF.Width.isScalar() || IC > 1)
10504       Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
10505 
10506     using namespace ore;
10507     if (!VectorizeLoop) {
10508       assert(IC > 1 && "interleave count should not be 1 or 0");
10509       // If we decided that it is not legal to vectorize the loop, then
10510       // interleave it.
10511       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10512                                  &CM, BFI, PSI, Checks);
10513 
10514       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10515       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10516 
10517       ORE->emit([&]() {
10518         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10519                                   L->getHeader())
10520                << "interleaved loop (interleaved count: "
10521                << NV("InterleaveCount", IC) << ")";
10522       });
10523     } else {
10524       // If we decided that it is *legal* to vectorize the loop, then do it.
10525 
10526       // Consider vectorizing the epilogue too if it's profitable.
10527       VectorizationFactor EpilogueVF =
10528           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10529       if (EpilogueVF.Width.isVector()) {
10530 
10531         // The first pass vectorizes the main loop and creates a scalar epilogue
10532         // to be vectorized by executing the plan (potentially with a different
10533         // factor) again shortly afterwards.
10534         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10535         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10536                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10537 
10538         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10539         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10540                         DT);
10541         ++LoopsVectorized;
10542 
10543         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10544         formLCSSARecursively(*L, *DT, LI, SE);
10545 
10546         // Second pass vectorizes the epilogue and adjusts the control flow
10547         // edges from the first pass.
10548         EPI.MainLoopVF = EPI.EpilogueVF;
10549         EPI.MainLoopUF = EPI.EpilogueUF;
10550         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10551                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10552                                                  Checks);
10553 
10554         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10555         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10556                         DT);
10557         ++LoopsEpilogueVectorized;
10558 
10559         if (!MainILV.areSafetyChecksAdded())
10560           DisableRuntimeUnroll = true;
10561       } else {
10562         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10563                                &LVL, &CM, BFI, PSI, Checks);
10564 
10565         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10566         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
10567         ++LoopsVectorized;
10568 
10569         // Add metadata to disable runtime unrolling a scalar loop when there
10570         // are no runtime checks about strides and memory. A scalar loop that is
10571         // rarely used is not worth unrolling.
10572         if (!LB.areSafetyChecksAdded())
10573           DisableRuntimeUnroll = true;
10574       }
10575       // Report the vectorization decision.
10576       ORE->emit([&]() {
10577         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10578                                   L->getHeader())
10579                << "vectorized loop (vectorization width: "
10580                << NV("VectorizationFactor", VF.Width)
10581                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10582       });
10583     }
10584 
10585     if (ORE->allowExtraAnalysis(LV_NAME))
10586       checkMixedPrecision(L, ORE);
10587   }
10588 
10589   Optional<MDNode *> RemainderLoopID =
10590       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10591                                       LLVMLoopVectorizeFollowupEpilogue});
10592   if (RemainderLoopID.hasValue()) {
10593     L->setLoopID(RemainderLoopID.getValue());
10594   } else {
10595     if (DisableRuntimeUnroll)
10596       AddRuntimeUnrollDisableMetaData(L);
10597 
10598     // Mark the loop as already vectorized to avoid vectorizing again.
10599     Hints.setAlreadyVectorized();
10600   }
10601 
10602   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10603   return true;
10604 }
10605 
10606 LoopVectorizeResult LoopVectorizePass::runImpl(
10607     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10608     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10609     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10610     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10611     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10612   SE = &SE_;
10613   LI = &LI_;
10614   TTI = &TTI_;
10615   DT = &DT_;
10616   BFI = &BFI_;
10617   TLI = TLI_;
10618   AA = &AA_;
10619   AC = &AC_;
10620   GetLAA = &GetLAA_;
10621   DB = &DB_;
10622   ORE = &ORE_;
10623   PSI = PSI_;
10624 
10625   // Don't attempt if
10626   // 1. the target claims to have no vector registers, and
10627   // 2. interleaving won't help ILP.
10628   //
10629   // The second condition is necessary because, even if the target has no
10630   // vector registers, loop vectorization may still enable scalar
10631   // interleaving.
10632   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10633       TTI->getMaxInterleaveFactor(1) < 2)
10634     return LoopVectorizeResult(false, false);
10635 
10636   bool Changed = false, CFGChanged = false;
10637 
10638   // The vectorizer requires loops to be in simplified form.
10639   // Since simplification may add new inner loops, it has to run before the
10640   // legality and profitability checks. This means running the loop vectorizer
10641   // will simplify all loops, regardless of whether anything end up being
10642   // vectorized.
10643   for (auto &L : *LI)
10644     Changed |= CFGChanged |=
10645         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10646 
10647   // Build up a worklist of inner-loops to vectorize. This is necessary as
10648   // the act of vectorizing or partially unrolling a loop creates new loops
10649   // and can invalidate iterators across the loops.
10650   SmallVector<Loop *, 8> Worklist;
10651 
10652   for (Loop *L : *LI)
10653     collectSupportedLoops(*L, LI, ORE, Worklist);
10654 
10655   LoopsAnalyzed += Worklist.size();
10656 
10657   // Now walk the identified inner loops.
10658   while (!Worklist.empty()) {
10659     Loop *L = Worklist.pop_back_val();
10660 
10661     // For the inner loops we actually process, form LCSSA to simplify the
10662     // transform.
10663     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10664 
10665     Changed |= CFGChanged |= processLoop(L);
10666   }
10667 
10668   // Process each loop nest in the function.
10669   return LoopVectorizeResult(Changed, CFGChanged);
10670 }
10671 
10672 PreservedAnalyses LoopVectorizePass::run(Function &F,
10673                                          FunctionAnalysisManager &AM) {
10674     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10675     auto &LI = AM.getResult<LoopAnalysis>(F);
10676     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10677     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10678     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10679     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10680     auto &AA = AM.getResult<AAManager>(F);
10681     auto &AC = AM.getResult<AssumptionAnalysis>(F);
10682     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10683     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10684 
10685     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10686     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10687         [&](Loop &L) -> const LoopAccessInfo & {
10688       LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
10689                                         TLI, TTI, nullptr, nullptr, nullptr};
10690       return LAM.getResult<LoopAccessAnalysis>(L, AR);
10691     };
10692     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10693     ProfileSummaryInfo *PSI =
10694         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10695     LoopVectorizeResult Result =
10696         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10697     if (!Result.MadeAnyChange)
10698       return PreservedAnalyses::all();
10699     PreservedAnalyses PA;
10700 
10701     // We currently do not preserve loopinfo/dominator analyses with outer loop
10702     // vectorization. Until this is addressed, mark these analyses as preserved
10703     // only for non-VPlan-native path.
10704     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10705     if (!EnableVPlanNativePath) {
10706       PA.preserve<LoopAnalysis>();
10707       PA.preserve<DominatorTreeAnalysis>();
10708     }
10709 
10710     if (Result.MadeCFGChange) {
10711       // Making CFG changes likely means a loop got vectorized. Indicate that
10712       // extra simplification passes should be run.
10713       // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
10714       // be run if runtime checks have been added.
10715       AM.getResult<ShouldRunExtraVectorPasses>(F);
10716       PA.preserve<ShouldRunExtraVectorPasses>();
10717     } else {
10718       PA.preserveSet<CFGAnalyses>();
10719     }
10720     return PA;
10721 }
10722 
10723 void LoopVectorizePass::printPipeline(
10724     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10725   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10726       OS, MapClassName2PassName);
10727 
10728   OS << "<";
10729   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10730   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10731   OS << ">";
10732 }
10733