1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
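//
// For example, a scalar loop such as
//   for (i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
// is conceptually rewritten (here with a vectorization factor of 4) as
//   for (i = 0; i + 3 < n; i += 4)
//     a[i:i+3] = b[i:i+3] + c[i:i+3];
// with any remaining iterations handled by a scalar epilogue loop.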
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is an ongoing effort to migrate the loop vectorizer to the VPlan
// infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/LLVMContext.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/InstructionCost.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142 #include "llvm/Transforms/Utils/SizeOpts.h"
143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144 #include <algorithm>
145 #include <cassert>
146 #include <cstdint>
147 #include <cstdlib>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
202     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks with a "
204              "vectorize(enable) pragma."));
205 
// The option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; the values below list the
// available choices. I.e., the vectorizer will try to fold the tail loop
// (epilogue) into the vector body and predicate the instructions accordingly.
// If tail-folding fails, these values select the fallback strategy.
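//
// For example, with a trip count of 10 and a vector width of 4, tail-folding
// executes three masked vector iterations (the last with only two active
// lanes) instead of two vector iterations followed by a two-iteration scalar
// epilogue.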
211 namespace PreferPredicateTy {
212   enum Option {
213     ScalarEpilogue = 0,
214     PredicateElseScalarEpilogue,
215     PredicateOrDontVectorize
216   };
217 } // namespace PreferPredicateTy
218 
219 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
220     "prefer-predicate-over-epilogue",
221     cl::init(PreferPredicateTy::ScalarEpilogue),
222     cl::Hidden,
223     cl::desc("Tail-folding and predication preferences over creating a scalar "
224              "epilogue loop."),
225     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
226                          "scalar-epilogue",
227                          "Don't tail-predicate loops, create scalar epilogue"),
228               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
229                          "predicate-else-scalar-epilogue",
230                          "prefer tail-folding, create scalar epilogue if tail "
231                          "folding fails."),
232               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
233                          "predicate-dont-vectorize",
                         "prefer tail-folding, don't attempt vectorization if "
235                          "tail-folding fails.")));
236 
237 static cl::opt<bool> MaximizeBandwidth(
238     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting the vectorization factor, "
             "which will be determined by the smallest type in the loop."));
241 
242 static cl::opt<bool> EnableInterleavedMemAccesses(
243     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
245 
246 /// An interleave-group may need masking if it resides in a block that needs
247 /// predication, or in order to mask away gaps.
248 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
249     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in "
             "a loop"));
251 
252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
253     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
255              "below this number"));
256 
257 static cl::opt<unsigned> ForceTargetNumScalarRegs(
258     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
259     cl::desc("A flag that overrides the target's number of scalar registers."));
260 
261 static cl::opt<unsigned> ForceTargetNumVectorRegs(
262     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
263     cl::desc("A flag that overrides the target's number of vector registers."));
264 
265 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
266     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
267     cl::desc("A flag that overrides the target's max interleave factor for "
268              "scalar loops."));
269 
270 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
271     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
272     cl::desc("A flag that overrides the target's max interleave factor for "
273              "vectorized loops."));
274 
275 static cl::opt<unsigned> ForceTargetInstructionCost(
276     "force-target-instruction-cost", cl::init(0), cl::Hidden,
277     cl::desc("A flag that overrides the target's expected cost for "
278              "an instruction to a single constant value. Mostly "
279              "useful for getting consistent testing."));
280 
281 static cl::opt<bool> ForceTargetSupportsScalableVectors(
282     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
283     cl::desc(
284         "Pretend that scalable vectors are supported, even if the target does "
285         "not support them. This flag should only be used for testing."));
286 
287 static cl::opt<unsigned> SmallLoopCost(
288     "small-loop-cost", cl::init(20), cl::Hidden,
289     cl::desc(
290         "The cost of a loop that is considered 'small' by the interleaver."));
291 
292 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
293     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
294     cl::desc("Enable the use of the block frequency analysis to access PGO "
295              "heuristics minimizing code growth in cold regions and being more "
296              "aggressive in hot regions."));
297 
298 // Runtime interleave loops for load/store throughput.
299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
300     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
301     cl::desc(
302         "Enable runtime interleaving until load/store ports are saturated"));
303 
304 /// Interleave small loops with scalar reductions.
305 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
306     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
307     cl::desc("Enable interleaving for loops with small iteration counts that "
308              "contain scalar reductions to expose ILP."));
309 
310 /// The number of stores in a loop that are allowed to need predication.
311 static cl::opt<unsigned> NumberOfStoresToPredicate(
312     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
313     cl::desc("Max number of stores to be predicated behind an if."));
314 
315 static cl::opt<bool> EnableIndVarRegisterHeur(
316     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
317     cl::desc("Count the induction variable only once when interleaving"));
318 
319 static cl::opt<bool> EnableCondStoresVectorization(
320     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if-predication of stores during vectorization."));
322 
323 static cl::opt<unsigned> MaxNestedScalarReductionIC(
324     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
325     cl::desc("The maximum interleave count to use when interleaving a scalar "
326              "reduction in a nested loop."));
327 
328 static cl::opt<bool>
329     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
330                            cl::Hidden,
331                            cl::desc("Prefer in-loop vector reductions, "
332                                     "overriding the targets preference."));
333 
334 static cl::opt<bool> ForceOrderedReductions(
335     "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
337              "FP reductions"));
338 
339 static cl::opt<bool> PreferPredicatedReductionSelect(
340     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
341     cl::desc(
342         "Prefer predicating a reduction operation over an after loop select."));
343 
344 cl::opt<bool> EnableVPlanNativePath(
345     "enable-vplan-native-path", cl::init(false), cl::Hidden,
346     cl::desc("Enable VPlan-native vectorization path with "
347              "support for outer loop vectorization."));
348 
349 // FIXME: Remove this switch once we have divergence analysis. Currently we
350 // assume divergent non-backedge branches when this switch is true.
351 cl::opt<bool> EnableVPlanPredication(
352     "enable-vplan-predication", cl::init(false), cl::Hidden,
353     cl::desc("Enable VPlan-native vectorization path predicator with "
354              "support for outer loop vectorization."));
355 
356 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
358 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
359 // verification of the H-CFGs built.
360 static cl::opt<bool> VPlanBuildStressTest(
361     "vplan-build-stress-test", cl::init(false), cl::Hidden,
362     cl::desc(
363         "Build VPlan for every supported loop nest in the function and bail "
364         "out right after the build (stress test the VPlan H-CFG construction "
365         "in the VPlan-native vectorization path)."));
366 
367 cl::opt<bool> llvm::EnableLoopInterleaving(
368     "interleave-loops", cl::init(true), cl::Hidden,
369     cl::desc("Enable loop interleaving in Loop vectorization passes"));
370 cl::opt<bool> llvm::EnableLoopVectorization(
371     "vectorize-loops", cl::init(true), cl::Hidden,
372     cl::desc("Run the Loop vectorization passes"));
373 
374 cl::opt<bool> PrintVPlansInDotFormat(
375     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
376     cl::desc("Use dot format instead of plain text when dumping VPlans"));
377 
378 /// A helper function that returns true if the given type is irregular. The
379 /// type is irregular if its allocated size doesn't equal the store size of an
380 /// element of the corresponding vector type.
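/// For example, x86_fp80 is irregular: the type is 80 bits wide, but its
/// allocation size is padded (typically to 96 or 128 bits), so an array of
/// x86_fp80 is not bitcast-compatible with <N x x86_fp80>.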
381 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
382   // Determine if an array of N elements of type Ty is "bitcast compatible"
383   // with a <N x Ty> vector.
384   // This is only true if there is no padding between the array elements.
385   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
386 }
387 
388 /// A helper function that returns the reciprocal of the block probability of
389 /// predicated blocks. If we return X, we are assuming the predicated block
390 /// will execute once for every X iterations of the loop header.
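/// For example, the returned value of 2 means a predicated block is assumed
/// to execute once every two iterations of the loop header.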
391 ///
392 /// TODO: We should use actual block probability here, if available. Currently,
393 ///       we always assume predicated blocks have a 50% chance of executing.
394 static unsigned getReciprocalPredBlockProb() { return 2; }
395 
396 /// A helper function that returns an integer or floating-point constant with
397 /// value C.
398 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
399   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
400                            : ConstantFP::get(Ty, C);
401 }
402 
403 /// Returns "best known" trip count for the specified loop \p L as defined by
404 /// the following procedure:
405 ///   1) Returns exact trip count if it is known.
406 ///   2) Returns expected trip count according to profile data if any.
407 ///   3) Returns upper bound estimate if it is known.
408 ///   4) Returns None if all of the above failed.
409 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
410   // Check if exact trip count is known.
411   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
412     return ExpectedTC;
413 
414   // Check if there is an expected trip count available from profile data.
415   if (LoopVectorizeWithBlockFrequency)
416     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
417       return EstimatedTC;
418 
419   // Check if upper bound estimate is known.
420   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
421     return ExpectedTC;
422 
423   return None;
424 }
425 
426 // Forward declare GeneratedRTChecks.
427 class GeneratedRTChecks;
428 
429 namespace llvm {
430 
431 AnalysisKey ShouldRunExtraVectorPasses::Key;
432 
433 /// InnerLoopVectorizer vectorizes loops which contain only one basic
434 /// block to a specified vectorization factor (VF).
435 /// This class performs the widening of scalars into vectors, or multiple
436 /// scalars. This class also implements the following features:
437 /// * It inserts an epilogue loop for handling loops that don't have iteration
438 ///   counts that are known to be a multiple of the vectorization factor.
439 /// * It handles the code generation for reduction variables.
440 /// * Scalarization (implementation using scalars) of un-vectorizable
441 ///   instructions.
442 /// InnerLoopVectorizer does not perform any vectorization-legality
443 /// checks, and relies on the caller to check for the different legality
444 /// aspects. The InnerLoopVectorizer relies on the
445 /// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
447 class InnerLoopVectorizer {
448 public:
449   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
450                       LoopInfo *LI, DominatorTree *DT,
451                       const TargetLibraryInfo *TLI,
452                       const TargetTransformInfo *TTI, AssumptionCache *AC,
453                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
454                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
455                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
456                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
457       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
458         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
459         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
460         PSI(PSI), RTChecks(RTChecks) {
461     // Query this against the original loop and save it here because the profile
462     // of the original loop header may change as the transformation happens.
463     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
464         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
465   }
466 
467   virtual ~InnerLoopVectorizer() = default;
468 
469   /// Create a new empty loop that will contain vectorized instructions later
470   /// on, while the old loop will be used as the scalar remainder. Control flow
471   /// is generated around the vectorized (and scalar epilogue) loops consisting
472   /// of various checks and bypasses. Return the pre-header block of the new
473   /// loop and the start value for the canonical induction, if it is != 0. The
474   /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
476   /// complex control flow around the loops.
477   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
478 
479   /// Widen a single call instruction within the innermost loop.
480   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
481                             VPTransformState &State);
482 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
484   void fixVectorizedLoop(VPTransformState &State);
485 
486   // Return true if any runtime check is added.
487   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
488 
489   /// A type for vectorized values in the new loop. Each value from the
490   /// original loop, when vectorized, is represented by UF vector values in the
491   /// new unrolled loop, where UF is the unroll factor.
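  /// For example, with UF = 2 and a VF of <4 x i32>, one original i32 value is
  /// represented by two <4 x i32> vector values.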
492   using VectorParts = SmallVector<Value *, 2>;
493 
494   /// Vectorize a single first-order recurrence or pointer induction PHINode in
495   /// a block. This method handles the induction variable canonicalization. It
496   /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
497   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
498                            VPTransformState &State);
499 
500   /// A helper function to scalarize a single Instruction in the innermost loop.
501   /// Generates a sequence of scalar instances for each lane between \p MinLane
502   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
503   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
504   /// Instr's operands.
505   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
506                             const VPIteration &Instance, bool IfPredicateInstr,
507                             VPTransformState &State);
508 
509   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
510   /// is provided, the integer induction variable will first be truncated to
511   /// the corresponding type. \p CanonicalIV is the scalar value generated for
512   /// the canonical induction variable.
513   void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
514                              VPTransformState &State, Value *CanonicalIV);
515 
516   /// Construct the vector value of a scalarized value \p V one lane at a time.
517   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
518                                  VPTransformState &State);
519 
520   /// Try to vectorize interleaved access group \p Group with the base address
521   /// given in \p Addr, optionally masking the vector operations if \p
522   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
523   /// values in the vectorized loop.
524   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
525                                 ArrayRef<VPValue *> VPDefs,
526                                 VPTransformState &State, VPValue *Addr,
527                                 ArrayRef<VPValue *> StoredValues,
528                                 VPValue *BlockInMask = nullptr);
529 
  /// Set the debug location in the builder \p CustomBuilder using the debug
  /// location in \p V. If \p CustomBuilder is None then the class member's
  /// Builder is used.
532   void setDebugLocFromInst(const Value *V,
533                            Optional<IRBuilder<> *> CustomBuilder = None);
534 
535   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
536   void fixNonInductionPHIs(VPTransformState &State);
537 
538   /// Returns true if the reordering of FP operations is not allowed, but we are
539   /// able to vectorize with strict in-order reductions for the given RdxDesc.
540   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
541 
542   /// Create a broadcast instruction. This method generates a broadcast
543   /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1,
  /// ...; this is needed because each iteration in the loop corresponds to a
  /// SIMD element.
547   virtual Value *getBroadcastInstrs(Value *V);
548 
549   /// Add metadata from one instruction to another.
550   ///
551   /// This includes both the original MDs from \p From and additional ones (\see
552   /// addNewMetadata).  Use this for *newly created* instructions in the vector
553   /// loop.
554   void addMetadata(Instruction *To, Instruction *From);
555 
556   /// Similar to the previous function but it adds the metadata to a
557   /// vector of instructions.
558   void addMetadata(ArrayRef<Value *> To, Instruction *From);
559 
560   // Returns the resume value (bc.merge.rdx) for a reduction as
561   // generated by fixReduction.
562   PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
563 
564 protected:
565   friend class LoopVectorizationPlanner;
566 
567   /// A small list of PHINodes.
568   using PhiVector = SmallVector<PHINode *, 4>;
569 
570   /// A type for scalarized values in the new loop. Each value from the
571   /// original loop, when scalarized, is represented by UF x VF scalar values
572   /// in the new unrolled loop, where UF is the unroll factor and VF is the
573   /// vectorization factor.
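  /// For example, with UF = 2 and VF = 4, one original value is represented
  /// by 2 x 4 = 8 scalar values, grouped as two parts of four lanes each.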
574   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
575 
576   /// Set up the values of the IVs correctly when exiting the vector loop.
577   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
578                     Value *CountRoundDown, Value *EndValue,
579                     BasicBlock *MiddleBlock);
580 
  /// Introduce a conditional branch (on true, condition to be set later) at
  /// the end of the header (which is also the latch), connecting it to itself
  /// (across the backedge) and to the exit block of \p L.
584   void createHeaderBranch(Loop *L);
585 
586   /// Handle all cross-iteration phis in the header.
587   void fixCrossIterationPHIs(VPTransformState &State);
588 
589   /// Create the exit value of first order recurrences in the middle block and
590   /// update their users.
591   void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
592                                VPTransformState &State);
593 
594   /// Create code for the loop exit value of the reduction.
595   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
596 
597   /// Clear NSW/NUW flags from reduction instructions if necessary.
598   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
599                                VPTransformState &State);
600 
  /// Fix up the LCSSA phi nodes in the unique exit block.  This simply
602   /// means we need to add the appropriate incoming value from the middle
603   /// block as exiting edges from the scalar epilogue loop (if present) are
604   /// already in place, and we exit the vector loop exclusively to the middle
605   /// block.
606   void fixLCSSAPHIs(VPTransformState &State);
607 
608   /// Iteratively sink the scalarized operands of a predicated instruction into
609   /// the block that was created for it.
610   void sinkScalarOperands(Instruction *PredInst);
611 
612   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
613   /// represented as.
614   void truncateToMinimalBitwidths(VPTransformState &State);
615 
616   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
617   /// variable on which to base the steps, \p Step is the size of the step, and
618   /// \p EntryVal is the value from the original loop that maps to the steps.
619   /// Note that \p EntryVal doesn't have to be an induction variable - it
620   /// can also be a truncate instruction.
621   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
622                         const InductionDescriptor &ID, VPValue *Def,
623                         VPTransformState &State);
624 
625   /// Create a vector induction phi node based on an existing scalar one. \p
626   /// EntryVal is the value from the original loop that maps to the vector phi
627   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
628   /// truncate instruction, instead of widening the original IV, we widen a
629   /// version of the IV truncated to \p EntryVal's type.
630   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
631                                        Value *Step, Value *Start,
632                                        Instruction *EntryVal, VPValue *Def,
633                                        VPTransformState &State);
634 
635   /// Returns (and creates if needed) the original loop trip count.
636   Value *getOrCreateTripCount(Loop *NewLoop);
637 
638   /// Returns (and creates if needed) the trip count of the widened loop.
639   Value *getOrCreateVectorTripCount(Loop *NewLoop);
640 
641   /// Returns a bitcasted value to the requested vector type.
642   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
643   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
644                                 const DataLayout &DL);
645 
646   /// Emit a bypass check to see if the vector trip count is zero, including if
647   /// it overflows.
648   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
649 
650   /// Emit a bypass check to see if all of the SCEV assumptions we've
651   /// had to make are correct. Returns the block containing the checks or
652   /// nullptr if no checks have been added.
653   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
654 
655   /// Emit bypass checks to check any memory assumptions we may have made.
656   /// Returns the block containing the checks or nullptr if no checks have been
657   /// added.
658   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
659 
660   /// Compute the transformed value of Index at offset StartValue using step
661   /// StepValue.
662   /// For integer induction, returns StartValue + Index * StepValue.
663   /// For pointer induction, returns StartValue[Index * StepValue].
664   /// FIXME: The newly created binary instructions should contain nsw/nuw
665   /// flags, which can be found from the original scalar operations.
666   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
667                               const DataLayout &DL,
668                               const InductionDescriptor &ID,
669                               BasicBlock *VectorHeader) const;
670 
671   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
672   /// vector loop preheader, middle block and scalar preheader. Also
673   /// allocate a loop object for the new vector loop and return it.
674   Loop *createVectorLoopSkeleton(StringRef Prefix);
675 
  /// Create new phi nodes for the induction variables so that the scalar
  /// epilogue resumes the iteration count from where the vectorized loop left
  /// off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
679   /// vectorization) and the resume values can come from an additional bypass
680   /// block, the \p AdditionalBypass pair provides information about the bypass
681   /// block and the end value on the edge from bypass to this loop.
682   void createInductionResumeValues(
683       Loop *L,
684       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
685 
686   /// Complete the loop skeleton by adding debug MDs, creating appropriate
687   /// conditional branches in the middle block, preparing the builder and
688   /// running the verifier. Take in the vector loop \p L as argument, and return
689   /// the preheader of the completed vector loop.
690   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
691 
692   /// Add additional metadata to \p To that was not present on \p Orig.
693   ///
694   /// Currently this is used to add the noalias annotations based on the
695   /// inserted memchecks.  Use this for instructions that are *cloned* into the
696   /// vector loop.
697   void addNewMetadata(Instruction *To, const Instruction *Orig);
698 
699   /// Collect poison-generating recipes that may generate a poison value that is
700   /// used after vectorization, even when their operands are not poison. Those
701   /// recipes meet the following conditions:
702   ///  * Contribute to the address computation of a recipe generating a widen
703   ///    memory load/store (VPWidenMemoryInstructionRecipe or
704   ///    VPInterleaveRecipe).
705   ///  * Such a widen memory load/store has at least one underlying Instruction
706   ///    that is in a basic block that needs predication and after vectorization
707   ///    the generated instruction won't be predicated.
708   void collectPoisonGeneratingRecipes(VPTransformState &State);
709 
710   /// Allow subclasses to override and print debug traces before/after vplan
711   /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}
714 
715   /// The original loop.
716   Loop *OrigLoop;
717 
718   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
719   /// dynamic knowledge to simplify SCEV expressions and converts them to a
720   /// more usable form.
721   PredicatedScalarEvolution &PSE;
722 
723   /// Loop Info.
724   LoopInfo *LI;
725 
726   /// Dominator Tree.
727   DominatorTree *DT;
728 
729   /// Alias Analysis.
730   AAResults *AA;
731 
732   /// Target Library Info.
733   const TargetLibraryInfo *TLI;
734 
735   /// Target Transform Info.
736   const TargetTransformInfo *TTI;
737 
738   /// Assumption Cache.
739   AssumptionCache *AC;
740 
741   /// Interface to emit optimization remarks.
742   OptimizationRemarkEmitter *ORE;
743 
744   /// LoopVersioning.  It's only set up (non-null) if memchecks were
745   /// used.
746   ///
747   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
749   std::unique_ptr<LoopVersioning> LVer;
750 
751   /// The vectorization SIMD factor to use. Each vector will have this many
752   /// vector elements.
753   ElementCount VF;
754 
755   /// The vectorization unroll factor to use. Each scalar is vectorized to this
756   /// many different vector instructions.
757   unsigned UF;
758 
759   /// The builder that we use
760   IRBuilder<> Builder;
761 
762   // --- Vectorization state ---
763 
764   /// The vector-loop preheader.
765   BasicBlock *LoopVectorPreHeader;
766 
767   /// The scalar-loop preheader.
768   BasicBlock *LoopScalarPreHeader;
769 
770   /// Middle Block between the vector and the scalar.
771   BasicBlock *LoopMiddleBlock;
772 
773   /// The unique ExitBlock of the scalar loop if one exists.  Note that
774   /// there can be multiple exiting edges reaching this block.
775   BasicBlock *LoopExitBlock;
776 
777   /// The vector loop body.
778   BasicBlock *LoopVectorBody;
779 
780   /// The scalar loop body.
781   BasicBlock *LoopScalarBody;
782 
783   /// A list of all bypass blocks. The first block is the entry of the loop.
784   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
785 
786   /// Store instructions that were predicated.
787   SmallVector<Instruction *, 4> PredicatedInstructions;
788 
789   /// Trip count of the original loop.
790   Value *TripCount = nullptr;
791 
792   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
793   Value *VectorTripCount = nullptr;
794 
795   /// The legality analysis.
796   LoopVectorizationLegality *Legal;
797 
  /// The profitability analysis.
799   LoopVectorizationCostModel *Cost;
800 
801   // Record whether runtime checks are added.
802   bool AddedSafetyChecks = false;
803 
804   // Holds the end values for each induction variable. We save the end values
805   // so we can later fix-up the external users of the induction variables.
806   DenseMap<PHINode *, Value *> IVEndValues;
807 
808   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
809   // fixed up at the end of vector code generation.
810   SmallVector<PHINode *, 8> OrigPHIsToFix;
811 
812   /// BFI and PSI are used to check for profile guided size optimizations.
813   BlockFrequencyInfo *BFI;
814   ProfileSummaryInfo *PSI;
815 
  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
818   bool OptForSizeBasedOnProfile;
819 
  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning up the checks if vectorization turns out to be unprofitable.
822   GeneratedRTChecks &RTChecks;
823 
824   // Holds the resume values for reductions in the loops, used to set the
825   // correct start value of reduction PHIs when vectorizing the epilogue.
826   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
827       ReductionResumeValues;
828 };
829 
830 class InnerLoopUnroller : public InnerLoopVectorizer {
831 public:
832   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
833                     LoopInfo *LI, DominatorTree *DT,
834                     const TargetLibraryInfo *TLI,
835                     const TargetTransformInfo *TTI, AssumptionCache *AC,
836                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
837                     LoopVectorizationLegality *LVL,
838                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
839                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
840       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
841                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
842                             BFI, PSI, Check) {}
843 
844 private:
845   Value *getBroadcastInstrs(Value *V) override;
846 };
847 
848 /// Encapsulate information regarding vectorization of a loop and its epilogue.
849 /// This information is meant to be updated and used across two stages of
850 /// epilogue vectorization.
851 struct EpilogueLoopVectorizationInfo {
852   ElementCount MainLoopVF = ElementCount::getFixed(0);
853   unsigned MainLoopUF = 0;
854   ElementCount EpilogueVF = ElementCount::getFixed(0);
855   unsigned EpilogueUF = 0;
856   BasicBlock *MainLoopIterationCountCheck = nullptr;
857   BasicBlock *EpilogueIterationCountCheck = nullptr;
858   BasicBlock *SCEVSafetyCheck = nullptr;
859   BasicBlock *MemSafetyCheck = nullptr;
860   Value *TripCount = nullptr;
861   Value *VectorTripCount = nullptr;
862 
863   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
864                                 ElementCount EVF, unsigned EUF)
865       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
866     assert(EUF == 1 &&
867            "A high UF for the epilogue loop is likely not beneficial.");
868   }
869 };
870 
871 /// An extension of the inner loop vectorizer that creates a skeleton for a
872 /// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice: first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue.  This is achieved by
876 /// deriving two concrete strategy classes from this base class and invoking
877 /// them in succession from the loop vectorizer planner.
878 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
879 public:
880   InnerLoopAndEpilogueVectorizer(
881       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
882       DominatorTree *DT, const TargetLibraryInfo *TLI,
883       const TargetTransformInfo *TTI, AssumptionCache *AC,
884       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
885       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
886       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
887       GeneratedRTChecks &Checks)
888       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
889                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
890                             Checks),
891         EPI(EPI) {}
892 
893   // Override this function to handle the more complex control flow around the
894   // three loops.
895   std::pair<BasicBlock *, Value *>
896   createVectorizedLoopSkeleton() final override {
897     return createEpilogueVectorizedLoopSkeleton();
898   }
899 
900   /// The interface for creating a vectorized skeleton using one of two
901   /// different strategies, each corresponding to one execution of the vplan
902   /// as described above.
903   virtual std::pair<BasicBlock *, Value *>
904   createEpilogueVectorizedLoopSkeleton() = 0;
905 
906   /// Holds and updates state information required to vectorize the main loop
907   /// and its epilogue in two separate passes. This setup helps us avoid
908   /// regenerating and recomputing runtime safety checks. It also helps us to
909   /// shorten the iteration-count-check path length for the cases where the
910   /// iteration count of the loop is so small that the main vector loop is
911   /// completely skipped.
912   EpilogueLoopVectorizationInfo &EPI;
913 };
914 
915 /// A specialized derived class of inner loop vectorizer that performs
916 /// vectorization of *main* loops in the process of vectorizing loops and their
917 /// epilogues.
918 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
919 public:
920   EpilogueVectorizerMainLoop(
921       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
922       DominatorTree *DT, const TargetLibraryInfo *TLI,
923       const TargetTransformInfo *TTI, AssumptionCache *AC,
924       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
925       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
926       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
927       GeneratedRTChecks &Check)
928       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
929                                        EPI, LVL, CM, BFI, PSI, Check) {}
930   /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
932   std::pair<BasicBlock *, Value *>
933   createEpilogueVectorizedLoopSkeleton() final override;
934 
935 protected:
936   /// Emits an iteration count bypass check once for the main loop (when \p
937   /// ForEpilogue is false) and once for the epilogue loop (when \p
938   /// ForEpilogue is true).
939   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
940                                              bool ForEpilogue);
941   void printDebugTracesAtStart() override;
942   void printDebugTracesAtEnd() override;
943 };
944 
945 // A specialized derived class of inner loop vectorizer that performs
946 // vectorization of *epilogue* loops in the process of vectorizing loops and
947 // their epilogues.
948 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
949 public:
950   EpilogueVectorizerEpilogueLoop(
951       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
952       DominatorTree *DT, const TargetLibraryInfo *TLI,
953       const TargetTransformInfo *TTI, AssumptionCache *AC,
954       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
955       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
956       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
957       GeneratedRTChecks &Checks)
958       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
959                                        EPI, LVL, CM, BFI, PSI, Checks) {}
960   /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
962   std::pair<BasicBlock *, Value *>
963   createEpilogueVectorizedLoopSkeleton() final override;
964 
965 protected:
966   /// Emits an iteration count bypass check after the main vector loop has
967   /// finished to see if there are any iterations left to execute by either
968   /// the vector epilogue or the scalar epilogue.
969   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
970                                                       BasicBlock *Bypass,
971                                                       BasicBlock *Insert);
972   void printDebugTracesAtStart() override;
973   void printDebugTracesAtEnd() override;
974 };
975 } // end namespace llvm
976 
/// Look for a meaningful debug location on the instruction or its
978 /// operands.
979 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
980   if (!I)
981     return I;
982 
983   DebugLoc Empty;
984   if (I->getDebugLoc() != Empty)
985     return I;
986 
987   for (Use &Op : I->operands()) {
988     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
989       if (OpInst->getDebugLoc() != Empty)
990         return OpInst;
991   }
992 
993   return I;
994 }
995 
996 void InnerLoopVectorizer::setDebugLocFromInst(
997     const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
998   IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
999   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
1000     const DILocation *DIL = Inst->getDebugLoc();
1001 
    // When an FSDiscriminator is enabled, we don't need to add the multiply
1003     // factors to the discriminators.
1004     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1005         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
1006       // FIXME: For scalable vectors, assume vscale=1.
1007       auto NewDIL =
1008           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1009       if (NewDIL)
1010         B->SetCurrentDebugLocation(NewDIL.getValue());
1011       else
1012         LLVM_DEBUG(dbgs()
1013                    << "Failed to create new discriminator: "
1014                    << DIL->getFilename() << " Line: " << DIL->getLine());
1015     } else
1016       B->SetCurrentDebugLocation(DIL);
1017   } else
1018     B->SetCurrentDebugLocation(DebugLoc());
1019 }
1020 
1021 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1022 /// is passed, the message relates to that particular instruction.
1023 #ifndef NDEBUG
1024 static void debugVectorizationMessage(const StringRef Prefix,
1025                                       const StringRef DebugMsg,
1026                                       Instruction *I) {
1027   dbgs() << "LV: " << Prefix << DebugMsg;
1028   if (I != nullptr)
1029     dbgs() << " " << *I;
1030   else
1031     dbgs() << '.';
1032   dbgs() << '\n';
1033 }
1034 #endif
1035 
1036 /// Create an analysis remark that explains why vectorization failed
1037 ///
1038 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1039 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1040 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1041 /// the location of the remark.  \return the remark object that can be
1042 /// streamed to.
1043 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1044     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1045   Value *CodeRegion = TheLoop->getHeader();
1046   DebugLoc DL = TheLoop->getStartLoc();
1047 
1048   if (I) {
1049     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
1052     if (I->getDebugLoc())
1053       DL = I->getDebugLoc();
1054   }
1055 
1056   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1057 }
1058 
1059 namespace llvm {
1060 
1061 /// Return a value for Step multiplied by VF.
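/// For example, for Step = 2 and a fixed VF of 4 this returns the constant 8;
/// for a scalable VF of 4 it returns the runtime value 8 * vscale.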
1062 Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
1063                        int64_t Step) {
1064   assert(Ty->isIntegerTy() && "Expected an integer step");
1065   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
1066   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1067 }
1068 
1069 /// Return the runtime value for VF.
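/// For example, a fixed VF of 8 yields the constant 8, while a scalable VF of
/// 8 yields the runtime value 8 * vscale.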
1070 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1071   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1072   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1073 }
1074 
1075 static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
1076   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1077   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1078   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1079   return B.CreateUIToFP(RuntimeVF, FTy);
1080 }
1081 
1082 void reportVectorizationFailure(const StringRef DebugMsg,
1083                                 const StringRef OREMsg, const StringRef ORETag,
1084                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1085                                 Instruction *I) {
1086   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1087   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1088   ORE->emit(
1089       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1090       << "loop not vectorized: " << OREMsg);
1091 }
1092 
1093 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1094                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1095                              Instruction *I) {
1096   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1097   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1098   ORE->emit(
1099       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1100       << Msg);
1101 }
1102 
1103 } // end namespace llvm
1104 
1105 #ifndef NDEBUG
1106 /// \return string containing a file name and a line # for the given loop.
1107 static std::string getDebugLocString(const Loop *L) {
1108   std::string Result;
1109   if (L) {
1110     raw_string_ostream OS(Result);
1111     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1112       LoopDbgLoc.print(OS);
1113     else
1114       // Just print the module name.
1115       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1116     OS.flush();
1117   }
1118   return Result;
1119 }
1120 #endif
1121 
1122 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1123                                          const Instruction *Orig) {
1124   // If the loop was versioned with memchecks, add the corresponding no-alias
1125   // metadata.
1126   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1127     LVer->annotateInstWithNoAlias(To, Orig);
1128 }
1129 
1130 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1131     VPTransformState &State) {
1132 
1133   // Collect recipes in the backward slice of `Root` that may generate a poison
1134   // value that is used after vectorization.
1135   SmallPtrSet<VPRecipeBase *, 16> Visited;
1136   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1137     SmallVector<VPRecipeBase *, 16> Worklist;
1138     Worklist.push_back(Root);
1139 
1140     // Traverse the backward slice of Root through its use-def chain.
1141     while (!Worklist.empty()) {
1142       VPRecipeBase *CurRec = Worklist.back();
1143       Worklist.pop_back();
1144 
1145       if (!Visited.insert(CurRec).second)
1146         continue;
1147 
1148       // Prune search if we find another recipe generating a widen memory
1149       // instruction. Widen memory instructions involved in address computation
1150       // will lead to gather/scatter instructions, which don't need to be
1151       // handled.
1152       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1153           isa<VPInterleaveRecipe>(CurRec) ||
1154           isa<VPCanonicalIVPHIRecipe>(CurRec))
1155         continue;
1156 
1157       // This recipe contributes to the address computation of a widen
1158       // load/store. Collect recipe if its underlying instruction has
1159       // poison-generating flags.
1160       Instruction *Instr = CurRec->getUnderlyingInstr();
1161       if (Instr && Instr->hasPoisonGeneratingFlags())
1162         State.MayGeneratePoisonRecipes.insert(CurRec);
1163 
1164       // Add new definitions to the worklist.
1165       for (VPValue *operand : CurRec->operands())
1166         if (VPDef *OpDef = operand->getDef())
1167           Worklist.push_back(cast<VPRecipeBase>(OpDef));
1168     }
1169   });
1170 
  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1174   auto Iter = depth_first(
1175       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1176   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1177     for (VPRecipeBase &Recipe : *VPBB) {
1178       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1179         Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
1180         VPDef *AddrDef = WidenRec->getAddr()->getDef();
1181         if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
1182             Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
1183           collectPoisonGeneratingInstrsInBackwardSlice(
1184               cast<VPRecipeBase>(AddrDef));
1185       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1186         VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1187         if (AddrDef) {
1188           // Check if any member of the interleave group needs predication.
1189           const InterleaveGroup<Instruction> *InterGroup =
1190               InterleaveRec->getInterleaveGroup();
1191           bool NeedPredication = false;
1192           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1193                I < NumMembers; ++I) {
1194             Instruction *Member = InterGroup->getMember(I);
1195             if (Member)
1196               NeedPredication |=
1197                   Legal->blockNeedsPredication(Member->getParent());
1198           }
1199 
1200           if (NeedPredication)
1201             collectPoisonGeneratingInstrsInBackwardSlice(
1202                 cast<VPRecipeBase>(AddrDef));
1203         }
1204       }
1205     }
1206   }
1207 }
1208 
1209 void InnerLoopVectorizer::addMetadata(Instruction *To,
1210                                       Instruction *From) {
1211   propagateMetadata(To, From);
1212   addNewMetadata(To, From);
1213 }
1214 
1215 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1216                                       Instruction *From) {
1217   for (Value *V : To) {
1218     if (Instruction *I = dyn_cast<Instruction>(V))
1219       addMetadata(I, From);
1220   }
1221 }
1222 
1223 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1224     const RecurrenceDescriptor &RdxDesc) {
1225   auto It = ReductionResumeValues.find(&RdxDesc);
1226   assert(It != ReductionResumeValues.end() &&
1227          "Expected to find a resume value for the reduction.");
1228   return It->second;
1229 }
1230 
1231 namespace llvm {
1232 
// Describes how the scalar epilogue loop should be lowered, as decided by the
// loop vectorization cost model.
1235 enum ScalarEpilogueLowering {
1236 
1237   // The default: allowing scalar epilogues.
1238   CM_ScalarEpilogueAllowed,
1239 
1240   // Vectorization with OptForSize: don't allow epilogues.
1241   CM_ScalarEpilogueNotAllowedOptSize,
1242 
  // A special case of vectorization with OptForSize: loops with a very small
1244   // trip count are considered for vectorization under OptForSize, thereby
1245   // making sure the cost of their loop body is dominant, free of runtime
1246   // guards and scalar iteration overheads.
1247   CM_ScalarEpilogueNotAllowedLowTripLoop,
1248 
1249   // Loop hint predicate indicating an epilogue is undesired.
1250   CM_ScalarEpilogueNotNeededUsePredicate,
1251 
  // Directive indicating we must either tail fold or not vectorize.
1253   CM_ScalarEpilogueNotAllowedUsePredicate
1254 };
1255 
1256 /// ElementCountComparator creates a total ordering for ElementCount
1257 /// for the purposes of using it in a set structure.
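/// For example, all fixed factors order before all scalable factors, because a
/// fixed count compares as (false, N) and a scalable count as (true, N).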
1258 struct ElementCountComparator {
1259   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1260     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1261            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1262   }
1263 };
1264 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1265 
1266 /// LoopVectorizationCostModel - estimates the expected speedups due to
1267 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
1270 /// expected speedup/slowdowns due to the supported instruction set. We use the
1271 /// TargetTransformInfo to query the different backends for the cost of
1272 /// different operations.
1273 class LoopVectorizationCostModel {
1274 public:
1275   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1276                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1277                              LoopVectorizationLegality *Legal,
1278                              const TargetTransformInfo &TTI,
1279                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1280                              AssumptionCache *AC,
1281                              OptimizationRemarkEmitter *ORE, const Function *F,
1282                              const LoopVectorizeHints *Hints,
1283                              InterleavedAccessInfo &IAI)
1284       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1285         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1286         Hints(Hints), InterleaveInfo(IAI) {}
1287 
1288   /// \return An upper bound for the vectorization factors (both fixed and
1289   /// scalable). If the factors are 0, vectorization and interleaving should be
1290   /// avoided up front.
1291   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1292 
1293   /// \return True if runtime checks are required for vectorization, and false
1294   /// otherwise.
1295   bool runtimeChecksRequired();
1296 
  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not zero,
  /// that vectorization factor is selected, provided vectorization for it is
  /// possible.
1301   VectorizationFactor
1302   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1303 
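  /// \return The vectorization factor to use for an epilogue vector loop,
  /// chosen based on the main loop's \p MaxVF and the available plans in
  /// \p LVP.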
1304   VectorizationFactor
1305   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1306                                     const LoopVectorizationPlanner &LVP);
1307 
1308   /// Setup cost-based decisions for user vectorization factor.
1309   /// \return true if the UserVF is a feasible VF to be chosen.
1310   bool selectUserVectorizationFactor(ElementCount UserVF) {
1311     collectUniformsAndScalars(UserVF);
1312     collectInstsToScalarize(UserVF);
1313     return expectedCost(UserVF).first.isValid();
1314   }
1315 
1316   /// \return The size (in bits) of the smallest and widest types in the code
1317   /// that needs to be vectorized. We ignore values that remain scalar such as
1318   /// 64 bit loop indices.
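  /// For example, a loop that loads i8 values and stores i32 results would
  /// report {8, 32}.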
1319   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1320 
1321   /// \return The desired interleave count.
1322   /// If interleave count has been specified by metadata it will be returned.
1323   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1324   /// are the selected vectorization factor and the cost of the selected VF.
1325   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1326 
  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on cost. This function makes
  /// cost-based decisions for Load/Store instructions and collects them in a
  /// map. This decision map is used for building the lists of loop-uniform and
  /// loop-scalar instructions. The calculated cost is saved together with the
  /// widening decision in order to avoid redundant calculations.
1334   void setCostBasedWideningDecision(ElementCount VF);
1335 
1336   /// A struct that represents some properties of the register usage
1337   /// of a loop.
1338   struct RegisterUsage {
1339     /// Holds the number of loop invariant values that are used in the loop.
1340     /// The key is ClassID of target-provided register class.
1341     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1342     /// Holds the maximum number of concurrent live intervals in the loop.
1343     /// The key is ClassID of target-provided register class.
1344     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1345   };
1346 
  /// \return Information about the register usage of the loop for the given
  /// vectorization factors.
1349   SmallVector<RegisterUsage, 8>
1350   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1351 
1352   /// Collect values we want to ignore in the cost model.
1353   void collectValuesToIgnore();
1354 
1355   /// Collect all element types in the loop for which widening is needed.
1356   void collectElementTypesForWidening();
1357 
  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1360   void collectInLoopReductions();
1361 
  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the IsOrdered flag of RdxDesc is set and we do
  /// not allow reordering of FP operations.
1366   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1367     return !Hints->allowReordering() && RdxDesc.isOrdered();
1368   }
1369 
1370   /// \returns The smallest bitwidth each instruction can be represented with.
1371   /// The vector equivalents of these instructions should be truncated to this
1372   /// type.
1373   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1374     return MinBWs;
1375   }
1376 
1377   /// \returns True if it is more profitable to scalarize instruction \p I for
1378   /// vectorization factor \p VF.
1379   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1380     assert(VF.isVector() &&
1381            "Profitable to scalarize relevant only for VF > 1.");
1382 
1383     // Cost model is not run in the VPlan-native path - return conservative
1384     // result until this changes.
1385     if (EnableVPlanNativePath)
1386       return false;
1387 
1388     auto Scalars = InstsToScalarize.find(VF);
1389     assert(Scalars != InstsToScalarize.end() &&
1390            "VF not yet analyzed for scalarization profitability");
1391     return Scalars->second.find(I) != Scalars->second.end();
1392   }
1393 
1394   /// Returns true if \p I is known to be uniform after vectorization.
1395   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1396     if (VF.isScalar())
1397       return true;
1398 
1399     // Cost model is not run in the VPlan-native path - return conservative
1400     // result until this changes.
1401     if (EnableVPlanNativePath)
1402       return false;
1403 
1404     auto UniformsPerVF = Uniforms.find(VF);
1405     assert(UniformsPerVF != Uniforms.end() &&
1406            "VF not yet analyzed for uniformity");
1407     return UniformsPerVF->second.count(I);
1408   }
1409 
1410   /// Returns true if \p I is known to be scalar after vectorization.
1411   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1412     if (VF.isScalar())
1413       return true;
1414 
1415     // Cost model is not run in the VPlan-native path - return conservative
1416     // result until this changes.
1417     if (EnableVPlanNativePath)
1418       return false;
1419 
1420     auto ScalarsPerVF = Scalars.find(VF);
1421     assert(ScalarsPerVF != Scalars.end() &&
1422            "Scalar values are not calculated for VF");
1423     return ScalarsPerVF->second.count(I);
1424   }
1425 
1426   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1427   /// for vectorization factor \p VF.
1428   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1429     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1430            !isProfitableToScalarize(I, VF) &&
1431            !isScalarAfterVectorization(I, VF);
1432   }
1433 
  /// Decision that was taken during cost calculation for a memory instruction.
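  /// For example, a load of A[i] with unit stride is CM_Widen, a load of
  /// A[n - i] is CM_Widen_Reverse, and a strided access such as A[2 * i] may
  /// become CM_Interleave, CM_GatherScatter or CM_Scalarize depending on cost
  /// and legality.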
1435   enum InstWidening {
1436     CM_Unknown,
1437     CM_Widen,         // For consecutive accesses with stride +1.
1438     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1439     CM_Interleave,
1440     CM_GatherScatter,
1441     CM_Scalarize
1442   };
1443 
1444   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1445   /// instruction \p I and vector width \p VF.
1446   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1447                            InstructionCost Cost) {
1448     assert(VF.isVector() && "Expected VF >=2");
1449     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1450   }
1451 
1452   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1453   /// interleaving group \p Grp and vector width \p VF.
1454   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1455                            ElementCount VF, InstWidening W,
1456                            InstructionCost Cost) {
1457     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group, but the
    // cost will be assigned to one instruction only.
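    // For example, for a two-member group {A[i], A[i + 1]} whose insert
    // position is the load of A[i], only that member carries the group cost;
    // the other member records a cost of 0.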
1460     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1461       if (auto *I = Grp->getMember(i)) {
1462         if (Grp->getInsertPos() == I)
1463           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1464         else
1465           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1466       }
1467     }
1468   }
1469 
1470   /// Return the cost model decision for the given instruction \p I and vector
1471   /// width \p VF. Return CM_Unknown if this instruction did not pass
1472   /// through the cost modeling.
1473   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1474     assert(VF.isVector() && "Expected VF to be a vector VF");
1475     // Cost model is not run in the VPlan-native path - return conservative
1476     // result until this changes.
1477     if (EnableVPlanNativePath)
1478       return CM_GatherScatter;
1479 
1480     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1481     auto Itr = WideningDecisions.find(InstOnVF);
1482     if (Itr == WideningDecisions.end())
1483       return CM_Unknown;
1484     return Itr->second.first;
1485   }
1486 
1487   /// Return the vectorization cost for the given instruction \p I and vector
1488   /// width \p VF.
1489   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1490     assert(VF.isVector() && "Expected VF >=2");
1491     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1492     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1493            "The cost is not calculated");
1494     return WideningDecisions[InstOnVF].second;
1495   }
1496 
  /// Return true if instruction \p I is an optimizable truncate whose operand
1498   /// is an induction variable. Such a truncate will be removed by adding a new
1499   /// induction variable with the destination type.
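  /// For example, a 'trunc i64 %iv to i32' of a widened induction variable can
  /// be replaced by a new i32 induction variable with the same start and step.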
1500   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1501     // If the instruction is not a truncate, return false.
1502     auto *Trunc = dyn_cast<TruncInst>(I);
1503     if (!Trunc)
1504       return false;
1505 
1506     // Get the source and destination types of the truncate.
1507     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1508     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1509 
1510     // If the truncate is free for the given types, return false. Replacing a
1511     // free truncate with an induction variable would add an induction variable
1512     // update instruction to each iteration of the loop. We exclude from this
1513     // check the primary induction variable since it will need an update
1514     // instruction regardless.
1515     Value *Op = Trunc->getOperand(0);
1516     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1517       return false;
1518 
1519     // If the truncated value is not an induction variable, return false.
1520     return Legal->isInductionPhi(Op);
1521   }
1522 
1523   /// Collects the instructions to scalarize for each predicated instruction in
1524   /// the loop.
1525   void collectInstsToScalarize(ElementCount VF);
1526 
1527   /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decision for Load/Store instructions
1529   /// that may be vectorized as interleave, gather-scatter or scalarized.
1530   void collectUniformsAndScalars(ElementCount VF) {
1531     // Do the analysis once.
1532     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1533       return;
1534     setCostBasedWideningDecision(VF);
1535     collectLoopUniforms(VF);
1536     collectLoopScalars(VF);
1537   }
1538 
1539   /// Returns true if the target machine supports masked store operation
1540   /// for the given \p DataType and kind of access to \p Ptr.
1541   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1542     return Legal->isConsecutivePtr(DataType, Ptr) &&
1543            TTI.isLegalMaskedStore(DataType, Alignment);
1544   }
1545 
1546   /// Returns true if the target machine supports masked load operation
1547   /// for the given \p DataType and kind of access to \p Ptr.
1548   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1549     return Legal->isConsecutivePtr(DataType, Ptr) &&
1550            TTI.isLegalMaskedLoad(DataType, Alignment);
1551   }
1552 
1553   /// Returns true if the target machine can represent \p V as a masked gather
1554   /// or scatter operation.
1555   bool isLegalGatherOrScatter(Value *V,
1556                               ElementCount VF = ElementCount::getFixed(1)) {
1557     bool LI = isa<LoadInst>(V);
1558     bool SI = isa<StoreInst>(V);
1559     if (!LI && !SI)
1560       return false;
1561     auto *Ty = getLoadStoreType(V);
1562     Align Align = getLoadStoreAlignment(V);
1563     if (VF.isVector())
1564       Ty = VectorType::get(Ty, VF);
1565     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1566            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1567   }
1568 
1569   /// Returns true if the target machine supports all of the reduction
1570   /// variables found for the given VF.
1571   bool canVectorizeReductions(ElementCount VF) const {
1572     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1573       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1574       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1575     }));
1576   }
1577 
1578   /// Returns true if \p I is an instruction that will be scalarized with
1579   /// predication when vectorizing \p I with vectorization factor \p VF. Such
1580   /// instructions include conditional stores and instructions that may divide
1581   /// by zero.
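  /// For example, a udiv in a conditionally executed block must be scalarized
  /// and predicated so that inactive lanes do not trap by dividing by zero.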
1582   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1583 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// \p VF is the vectorization factor that will be used to vectorize \p I.
  /// Superset of instructions that return true for isScalarWithPredication.
1588   bool isPredicatedInst(Instruction *I, ElementCount VF,
1589                         bool IsKnownUniform = false) {
1590     // When we know the load is uniform and the original scalar loop was not
1591     // predicated we don't need to mark it as a predicated instruction. Any
    // vectorized blocks created when tail-folding are something artificial we
1593     // have introduced and we know there is always at least one active lane.
1594     // That's why we call Legal->blockNeedsPredication here because it doesn't
1595     // query tail-folding.
1596     if (IsKnownUniform && isa<LoadInst>(I) &&
1597         !Legal->blockNeedsPredication(I->getParent()))
1598       return false;
1599     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1600       return false;
1601     // Loads and stores that need some form of masked operation are predicated
1602     // instructions.
1603     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1604       return Legal->isMaskRequired(I);
1605     return isScalarWithPredication(I, VF);
1606   }
1607 
1608   /// Returns true if \p I is a memory instruction with consecutive memory
1609   /// access that can be widened.
1610   bool
1611   memoryInstructionCanBeWidened(Instruction *I,
1612                                 ElementCount VF = ElementCount::getFixed(1));
1613 
1614   /// Returns true if \p I is a memory instruction in an interleaved-group
1615   /// of memory accesses that can be vectorized with wide vector loads/stores
1616   /// and shuffles.
1617   bool
1618   interleavedAccessCanBeWidened(Instruction *I,
1619                                 ElementCount VF = ElementCount::getFixed(1));
1620 
1621   /// Check if \p Instr belongs to any interleaved access group.
1622   bool isAccessInterleaved(Instruction *Instr) {
1623     return InterleaveInfo.isInterleaved(Instr);
1624   }
1625 
1626   /// Get the interleaved access group that \p Instr belongs to.
1627   const InterleaveGroup<Instruction> *
1628   getInterleavedAccessGroup(Instruction *Instr) {
1629     return InterleaveInfo.getInterleaveGroup(Instr);
1630   }
1631 
1632   /// Returns true if we're required to use a scalar epilogue for at least
1633   /// the final iteration of the original loop.
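  /// This is the case, for example, when an interleave group contains gaps:
  /// executing the last iterations in the vector loop could otherwise access
  /// memory beyond what the original scalar loop would have touched.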
1634   bool requiresScalarEpilogue(ElementCount VF) const {
1635     if (!isScalarEpilogueAllowed())
1636       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1639     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1640       return true;
1641     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1642   }
1643 
1644   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1645   /// loop hint annotation.
1646   bool isScalarEpilogueAllowed() const {
1647     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1648   }
1649 
  /// Returns true if all loop blocks should be masked to fold the tail of the
  /// loop.
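  /// This typically happens when no scalar epilogue is allowed (e.g. under
  /// optsize) and the trip count may not be a multiple of the chosen VF; the
  /// remainder iterations are then executed in the vector loop under a mask.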
1651   bool foldTailByMasking() const { return FoldTailByMasking; }
1652 
  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
1656   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1657     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1658   }
1659 
1660   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1661   /// nodes to the chain of instructions representing the reductions. Uses a
1662   /// MapVector to ensure deterministic iteration order.
1663   using ReductionChainMap =
1664       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1665 
1666   /// Return the chain of instructions representing an inloop reduction.
1667   const ReductionChainMap &getInLoopReductionChains() const {
1668     return InLoopReductionChains;
1669   }
1670 
1671   /// Returns true if the Phi is part of an inloop reduction.
1672   bool isInLoopReduction(PHINode *Phi) const {
1673     return InLoopReductionChains.count(Phi);
1674   }
1675 
1676   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1677   /// with factor VF.  Return the cost of the instruction, including
1678   /// scalarization overhead if it's needed.
1679   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1680 
1681   /// Estimate cost of a call instruction CI if it were vectorized with factor
1682   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available, or it is too
  /// expensive.
1686   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1687                                     bool &NeedToScalarize) const;
1688 
1689   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1690   /// that of B.
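  /// For example, VF = 4 with cost 20 (5 per lane) is considered more
  /// profitable than VF = 2 with cost 12 (6 per lane).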
1691   bool isMoreProfitable(const VectorizationFactor &A,
1692                         const VectorizationFactor &B) const;
1693 
1694   /// Invalidates decisions already taken by the cost model.
1695   void invalidateCostModelingDecisions() {
1696     WideningDecisions.clear();
1697     Uniforms.clear();
1698     Scalars.clear();
1699   }
1700 
1701 private:
1702   unsigned NumPredStores = 0;
1703 
1704   /// \return An upper bound for the vectorization factors for both
1705   /// fixed and scalable vectorization, where the minimum-known number of
1706   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1707   /// disabled or unsupported, then the scalable part will be equal to
1708   /// ElementCount::getScalable(0).
1709   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1710                                            ElementCount UserVF,
1711                                            bool FoldTailByMasking);
1712 
  /// \return the maximized element count based on the target's vector
1714   /// registers and the loop trip-count, but limited to a maximum safe VF.
1715   /// This is a helper function of computeFeasibleMaxVF.
1716   /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1717   /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
1719   /// D98509). The issue is currently under investigation and this workaround
1720   /// will be removed as soon as possible.
1721   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1722                                        unsigned SmallestType,
1723                                        unsigned WidestType,
1724                                        const ElementCount &MaxSafeVF,
1725                                        bool FoldTailByMasking);
1726 
1727   /// \return the maximum legal scalable VF, based on the safe max number
1728   /// of elements.
1729   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1730 
1731   /// The vectorization cost is a combination of the cost itself and a boolean
1732   /// indicating whether any of the contributing operations will actually
1733   /// operate on vector values after type legalization in the backend. If this
1734   /// latter value is false, then all operations will be scalarized (i.e. no
1735   /// vectorization has actually taken place).
1736   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1737 
1738   /// Returns the expected execution cost. The unit of the cost does
1739   /// not matter because we use the 'cost' units to compare different
1740   /// vector widths. The cost that is returned is *not* normalized by
1741   /// the factor width. If \p Invalid is not nullptr, this function
1742   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1743   /// each instruction that has an Invalid cost for the given VF.
1744   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1745   VectorizationCostTy
1746   expectedCost(ElementCount VF,
1747                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1748 
1749   /// Returns the execution time cost of an instruction for a given vector
1750   /// width. Vector width of one means scalar.
1751   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1752 
1753   /// The cost-computation logic from getInstructionCost which provides
1754   /// the vector type as an output parameter.
1755   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1756                                      Type *&VectorTy);
1757 
1758   /// Return the cost of instructions in an inloop reduction pattern, if I is
1759   /// part of that pattern.
1760   Optional<InstructionCost>
1761   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1762                           TTI::TargetCostKind CostKind);
1763 
1764   /// Calculate vectorization cost of memory instruction \p I.
1765   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1766 
1767   /// The cost computation for scalarized memory instruction.
1768   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1769 
1770   /// The cost computation for interleaving group of memory instructions.
1771   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1772 
1773   /// The cost computation for Gather/Scatter instruction.
1774   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1775 
1776   /// The cost computation for widening instruction \p I with consecutive
1777   /// memory access.
1778   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1779 
1780   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1781   /// Load: scalar load + broadcast.
1782   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1783   /// element)
1784   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1785 
1786   /// Estimate the overhead of scalarizing an instruction. This is a
1787   /// convenience wrapper for the type-based getScalarizationOverhead API.
1788   InstructionCost getScalarizationOverhead(Instruction *I,
1789                                            ElementCount VF) const;
1790 
  /// Returns whether the instruction is a load or store and will be emitted
1792   /// as a vector operation.
1793   bool isConsecutiveLoadOrStore(Instruction *I);
1794 
1795   /// Returns true if an artificially high cost for emulated masked memrefs
1796   /// should be used.
1797   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1798 
1799   /// Map of scalar integer values to the smallest bitwidth they can be legally
1800   /// represented as. The vector equivalents of these values should be truncated
1801   /// to this type.
1802   MapVector<Instruction *, uint64_t> MinBWs;
1803 
1804   /// A type representing the costs for instructions if they were to be
1805   /// scalarized rather than vectorized. The entries are Instruction-Cost
1806   /// pairs.
1807   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1808 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1811   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1812 
1813   /// Records whether it is allowed to have the original scalar loop execute at
1814   /// least once. This may be needed as a fallback loop in case runtime
1815   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
1817   /// or as a peel-loop to handle gaps in interleave-groups.
1818   /// Under optsize and when the trip count is very small we don't allow any
1819   /// iterations to execute in the scalar loop.
1820   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1821 
  /// All blocks of the loop are to be masked in order to fold the tail of the
  /// scalar iterations.
1823   bool FoldTailByMasking = false;
1824 
1825   /// A map holding scalar costs for different vectorization factors. The
1826   /// presence of a cost for an instruction in the mapping indicates that the
1827   /// instruction will be scalarized when vectorizing with the associated
1828   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1829   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1830 
1831   /// Holds the instructions known to be uniform after vectorization.
1832   /// The data is collected per VF.
1833   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1834 
1835   /// Holds the instructions known to be scalar after vectorization.
1836   /// The data is collected per VF.
1837   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1838 
1839   /// Holds the instructions (address computations) that are forced to be
1840   /// scalarized.
1841   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1842 
1843   /// PHINodes of the reductions that should be expanded in-loop along with
1844   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1846   ReductionChainMap InLoopReductionChains;
1847 
1848   /// A Map of inloop reduction operations and their immediate chain operand.
1849   /// FIXME: This can be removed once reductions can be costed correctly in
1850   /// vplan. This was added to allow quick lookup to the inloop operations,
1851   /// without having to loop through InLoopReductionChains.
1852   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1853 
1854   /// Returns the expected difference in cost from scalarizing the expression
1855   /// feeding a predicated instruction \p PredInst. The instructions to
1856   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1857   /// non-negative return value implies the expression will be scalarized.
1858   /// Currently, only single-use chains are considered for scalarization.
1859   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1860                               ElementCount VF);
1861 
1862   /// Collect the instructions that are uniform after vectorization. An
1863   /// instruction is uniform if we represent it with a single scalar value in
1864   /// the vectorized loop corresponding to each vector iteration. Examples of
1865   /// uniform instructions include pointer operands of consecutive or
1866   /// interleaved memory accesses. Note that although uniformity implies an
1867   /// instruction will be scalar, the reverse is not true. In general, a
1868   /// scalarized instruction will be represented by VF scalar values in the
1869   /// vectorized loop, each corresponding to an iteration of the original
1870   /// scalar loop.
1871   void collectLoopUniforms(ElementCount VF);
1872 
1873   /// Collect the instructions that are scalar after vectorization. An
1874   /// instruction is scalar if it is known to be uniform or will be scalarized
1875   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1876   /// to the list if they are used by a load/store instruction that is marked as
1877   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1878   /// VF values in the vectorized loop, each corresponding to an iteration of
1879   /// the original scalar loop.
1880   void collectLoopScalars(ElementCount VF);
1881 
1882   /// Keeps cost model vectorization decision and cost for instructions.
1883   /// Right now it is used for memory instructions only.
1884   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1885                                 std::pair<InstWidening, InstructionCost>>;
1886 
1887   DecisionList WideningDecisions;
1888 
1889   /// Returns true if \p V is expected to be vectorized and it needs to be
1890   /// extracted.
1891   bool needsExtract(Value *V, ElementCount VF) const {
1892     Instruction *I = dyn_cast<Instruction>(V);
1893     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1894         TheLoop->isLoopInvariant(I))
1895       return false;
1896 
1897     // Assume we can vectorize V (and hence we need extraction) if the
1898     // scalars are not computed yet. This can happen, because it is called
1899     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1900     // the scalars are collected. That should be a safe assumption in most
1901     // cases, because we check if the operands have vectorizable types
1902     // beforehand in LoopVectorizationLegality.
1903     return Scalars.find(VF) == Scalars.end() ||
1904            !isScalarAfterVectorization(I, VF);
  }
1906 
1907   /// Returns a range containing only operands needing to be extracted.
1908   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1909                                                    ElementCount VF) const {
1910     return SmallVector<Value *, 4>(make_filter_range(
1911         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1912   }
1913 
1914   /// Determines if we have the infrastructure to vectorize loop \p L and its
1915   /// epilogue, assuming the main loop is vectorized by \p VF.
1916   bool isCandidateForEpilogueVectorization(const Loop &L,
1917                                            const ElementCount VF) const;
1918 
1919   /// Returns true if epilogue vectorization is considered profitable, and
1920   /// false otherwise.
1921   /// \p VF is the vectorization factor chosen for the original loop.
1922   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1923 
1924 public:
1925   /// The loop that we evaluate.
1926   Loop *TheLoop;
1927 
1928   /// Predicated scalar evolution analysis.
1929   PredicatedScalarEvolution &PSE;
1930 
1931   /// Loop Info analysis.
1932   LoopInfo *LI;
1933 
1934   /// Vectorization legality.
1935   LoopVectorizationLegality *Legal;
1936 
1937   /// Vector target information.
1938   const TargetTransformInfo &TTI;
1939 
1940   /// Target Library Info.
1941   const TargetLibraryInfo *TLI;
1942 
1943   /// Demanded bits analysis.
1944   DemandedBits *DB;
1945 
1946   /// Assumption cache.
1947   AssumptionCache *AC;
1948 
1949   /// Interface to emit optimization remarks.
1950   OptimizationRemarkEmitter *ORE;
1951 
1952   const Function *TheFunction;
1953 
1954   /// Loop Vectorize Hint.
1955   const LoopVectorizeHints *Hints;
1956 
  /// The interleave access information contains groups of interleaved accesses
  /// that have the same stride and are close to each other.
1959   InterleavedAccessInfo &InterleaveInfo;
1960 
1961   /// Values to ignore in the cost model.
1962   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1963 
1964   /// Values to ignore in the cost model when VF > 1.
1965   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1966 
1967   /// All element types found in the loop.
1968   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1969 
1970   /// Profitable vector factors.
1971   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1972 };
1973 } // end namespace llvm
1974 
1975 /// Helper struct to manage generating runtime checks for vectorization.
1976 ///
/// The runtime checks are created up-front in temporary blocks, un-linked from
/// the existing IR, to allow a better estimate of their cost. After deciding to
1979 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1980 /// temporary blocks are completely removed.
1981 class GeneratedRTChecks {
1982   /// Basic block which contains the generated SCEV checks, if any.
1983   BasicBlock *SCEVCheckBlock = nullptr;
1984 
1985   /// The value representing the result of the generated SCEV checks. If it is
1986   /// nullptr, either no SCEV checks have been generated or they have been used.
1987   Value *SCEVCheckCond = nullptr;
1988 
1989   /// Basic block which contains the generated memory runtime checks, if any.
1990   BasicBlock *MemCheckBlock = nullptr;
1991 
1992   /// The value representing the result of the generated memory runtime checks.
1993   /// If it is nullptr, either no memory runtime checks have been generated or
1994   /// they have been used.
1995   Value *MemRuntimeCheckCond = nullptr;
1996 
1997   DominatorTree *DT;
1998   LoopInfo *LI;
1999 
2000   SCEVExpander SCEVExp;
2001   SCEVExpander MemCheckExp;
2002 
2003 public:
2004   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
2005                     const DataLayout &DL)
2006       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
2007         MemCheckExp(SE, DL, "scev.check") {}
2008 
2009   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
2010   /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
2012   /// there is no vector code generation, the check blocks are removed
2013   /// completely.
2014   void Create(Loop *L, const LoopAccessInfo &LAI,
2015               const SCEVUnionPredicate &UnionPred) {
2016 
2017     BasicBlock *LoopHeader = L->getHeader();
2018     BasicBlock *Preheader = L->getLoopPreheader();
2019 
2020     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
2021     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
2022     // may be used by SCEVExpander. The blocks will be un-linked from their
2023     // predecessors and removed from LI & DT at the end of the function.
2024     if (!UnionPred.isAlwaysTrue()) {
2025       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
2026                                   nullptr, "vector.scevcheck");
2027 
2028       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
2029           &UnionPred, SCEVCheckBlock->getTerminator());
2030     }
2031 
2032     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2033     if (RtPtrChecking.Need) {
2034       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2035       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2036                                  "vector.memcheck");
2037 
2038       MemRuntimeCheckCond =
2039           addRuntimeChecks(MemCheckBlock->getTerminator(), L,
2040                            RtPtrChecking.getChecks(), MemCheckExp);
2041       assert(MemRuntimeCheckCond &&
2042              "no RT checks generated although RtPtrChecking "
2043              "claimed checks are required");
2044     }
2045 
2046     if (!MemCheckBlock && !SCEVCheckBlock)
2047       return;
2048 
    // Unhook the temporary blocks containing the checks and update various
    // places accordingly.
2051     if (SCEVCheckBlock)
2052       SCEVCheckBlock->replaceAllUsesWith(Preheader);
2053     if (MemCheckBlock)
2054       MemCheckBlock->replaceAllUsesWith(Preheader);
2055 
2056     if (SCEVCheckBlock) {
2057       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2058       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2059       Preheader->getTerminator()->eraseFromParent();
2060     }
2061     if (MemCheckBlock) {
2062       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2063       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2064       Preheader->getTerminator()->eraseFromParent();
2065     }
2066 
2067     DT->changeImmediateDominator(LoopHeader, Preheader);
2068     if (MemCheckBlock) {
2069       DT->eraseNode(MemCheckBlock);
2070       LI->removeBlock(MemCheckBlock);
2071     }
2072     if (SCEVCheckBlock) {
2073       DT->eraseNode(SCEVCheckBlock);
2074       LI->removeBlock(SCEVCheckBlock);
2075     }
2076   }
2077 
2078   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2079   /// unused.
2080   ~GeneratedRTChecks() {
2081     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2082     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2083     if (!SCEVCheckCond)
2084       SCEVCleaner.markResultUsed();
2085 
2086     if (!MemRuntimeCheckCond)
2087       MemCheckCleaner.markResultUsed();
2088 
2089     if (MemRuntimeCheckCond) {
2090       auto &SE = *MemCheckExp.getSE();
2091       // Memory runtime check generation creates compares that use expanded
2092       // values. Remove them before running the SCEVExpanderCleaners.
2093       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2094         if (MemCheckExp.isInsertedInstruction(&I))
2095           continue;
2096         SE.forgetValue(&I);
2097         I.eraseFromParent();
2098       }
2099     }
2100     MemCheckCleaner.cleanup();
2101     SCEVCleaner.cleanup();
2102 
2103     if (SCEVCheckCond)
2104       SCEVCheckBlock->eraseFromParent();
2105     if (MemRuntimeCheckCond)
2106       MemCheckBlock->eraseFromParent();
2107   }
2108 
2109   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2110   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2111   /// depending on the generated condition.
2112   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2113                              BasicBlock *LoopVectorPreHeader,
2114                              BasicBlock *LoopExitBlock) {
2115     if (!SCEVCheckCond)
2116       return nullptr;
2117     if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2118       if (C->isZero())
2119         return nullptr;
2120 
2121     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2122 
2123     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // If the vector loop preheader is part of an outer loop, the new SCEV
    // check block must be added to that loop as well.
2125     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2126       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2127 
2128     SCEVCheckBlock->getTerminator()->eraseFromParent();
2129     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2130     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2131                                                 SCEVCheckBlock);
2132 
2133     DT->addNewBlock(SCEVCheckBlock, Pred);
2134     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2135 
2136     ReplaceInstWithInst(
2137         SCEVCheckBlock->getTerminator(),
2138         BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2139     // Mark the check as used, to prevent it from being removed during cleanup.
2140     SCEVCheckCond = nullptr;
2141     return SCEVCheckBlock;
2142   }
2143 
2144   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2145   /// the branches to branch to the vector preheader or \p Bypass, depending on
2146   /// the generated condition.
2147   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2148                                    BasicBlock *LoopVectorPreHeader) {
2149     // Check if we generated code that checks in runtime if arrays overlap.
2150     if (!MemRuntimeCheckCond)
2151       return nullptr;
2152 
2153     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2154     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2155                                                 MemCheckBlock);
2156 
2157     DT->addNewBlock(MemCheckBlock, Pred);
2158     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2159     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2160 
2161     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2162       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2163 
2164     ReplaceInstWithInst(
2165         MemCheckBlock->getTerminator(),
2166         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2167     MemCheckBlock->getTerminator()->setDebugLoc(
2168         Pred->getTerminator()->getDebugLoc());
2169 
2170     // Mark the check as used, to prevent it from being removed during cleanup.
2171     MemRuntimeCheckCond = nullptr;
2172     return MemCheckBlock;
2173   }
2174 };
2175 
2176 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2177 // vectorization. The loop needs to be annotated with #pragma omp simd
2178 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2179 // vector length information is not provided, vectorization is not considered
2180 // explicit. Interleave hints are not allowed either. These limitations will be
2181 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
2183 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2184 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2185 // provides *explicit vectorization hints* (LV can bypass legal checks and
2186 // assume that vectorization is legal). However, both hints are implemented
2187 // using the same metadata (llvm.loop.vectorize, processed by
2188 // LoopVectorizeHints). This will be fixed in the future when the native IR
2189 // representation for pragma 'omp simd' is introduced.
2190 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2191                                    OptimizationRemarkEmitter *ORE) {
2192   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2193   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2194 
2195   // Only outer loops with an explicit vectorization hint are supported.
2196   // Unannotated outer loops are ignored.
2197   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2198     return false;
2199 
2200   Function *Fn = OuterLp->getHeader()->getParent();
2201   if (!Hints.allowVectorization(Fn, OuterLp,
2202                                 true /*VectorizeOnlyWhenForced*/)) {
2203     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2204     return false;
2205   }
2206 
2207   if (Hints.getInterleave() > 1) {
2208     // TODO: Interleave support is future work.
2209     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2210                          "outer loops.\n");
2211     Hints.emitRemarkWithHints();
2212     return false;
2213   }
2214 
2215   return true;
2216 }
2217 
2218 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2219                                   OptimizationRemarkEmitter *ORE,
2220                                   SmallVectorImpl<Loop *> &V) {
2221   // Collect inner loops and outer loops without irreducible control flow. For
2222   // now, only collect outer loops that have explicit vectorization hints. If we
2223   // are stress testing the VPlan H-CFG construction, we collect the outermost
2224   // loop of every loop nest.
2225   if (L.isInnermost() || VPlanBuildStressTest ||
2226       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2227     LoopBlocksRPO RPOT(&L);
2228     RPOT.perform(LI);
2229     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2230       V.push_back(&L);
2231       // TODO: Collect inner loops inside marked outer loops in case
2232       // vectorization fails for the outer loop. Do not invoke
2233       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2234       // already known to be reducible. We can use an inherited attribute for
2235       // that.
2236       return;
2237     }
2238   }
2239   for (Loop *InnerL : L)
2240     collectSupportedLoops(*InnerL, LI, ORE, V);
2241 }
2242 
2243 namespace {
2244 
2245 /// The LoopVectorize Pass.
2246 struct LoopVectorize : public FunctionPass {
2247   /// Pass identification, replacement for typeid
2248   static char ID;
2249 
2250   LoopVectorizePass Impl;
2251 
2252   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2253                          bool VectorizeOnlyWhenForced = false)
2254       : FunctionPass(ID),
2255         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2256     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2257   }
2258 
2259   bool runOnFunction(Function &F) override {
2260     if (skipFunction(F))
2261       return false;
2262 
2263     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2264     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2265     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2266     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2267     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2268     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2269     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2270     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2271     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2272     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2273     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2274     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2275     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2276 
2277     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2278         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2279 
2280     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2281                         GetLAA, *ORE, PSI).MadeAnyChange;
2282   }
2283 
2284   void getAnalysisUsage(AnalysisUsage &AU) const override {
2285     AU.addRequired<AssumptionCacheTracker>();
2286     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2287     AU.addRequired<DominatorTreeWrapperPass>();
2288     AU.addRequired<LoopInfoWrapperPass>();
2289     AU.addRequired<ScalarEvolutionWrapperPass>();
2290     AU.addRequired<TargetTransformInfoWrapperPass>();
2291     AU.addRequired<AAResultsWrapperPass>();
2292     AU.addRequired<LoopAccessLegacyAnalysis>();
2293     AU.addRequired<DemandedBitsWrapperPass>();
2294     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2295     AU.addRequired<InjectTLIMappingsLegacy>();
2296 
2297     // We currently do not preserve loopinfo/dominator analyses with outer loop
2298     // vectorization. Until this is addressed, mark these analyses as preserved
2299     // only for non-VPlan-native path.
2300     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2301     if (!EnableVPlanNativePath) {
2302       AU.addPreserved<LoopInfoWrapperPass>();
2303       AU.addPreserved<DominatorTreeWrapperPass>();
2304     }
2305 
2306     AU.addPreserved<BasicAAWrapperPass>();
2307     AU.addPreserved<GlobalsAAWrapperPass>();
2308     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2309   }
2310 };
2311 
2312 } // end anonymous namespace
2313 
2314 //===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
// LoopVectorizationCostModel and LoopVectorizationPlanner.
2317 //===----------------------------------------------------------------------===//
2318 
2319 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
2323   Instruction *Instr = dyn_cast<Instruction>(V);
2324   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2325                      (!Instr ||
2326                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2327   // Place the code for broadcasting invariant variables in the new preheader.
2328   IRBuilder<>::InsertPointGuard Guard(Builder);
2329   if (SafeToHoist)
2330     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2331 
2332   // Broadcast the scalar into all locations in the vector.
2333   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2334 
2335   return Shuf;
2336 }
2337 
/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is only relevant for FP induction variables.
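/// For example, with StartIdx = 0, Step = 2, VF = 4 and Val = <a, a, a, a>,
/// the integer path produces the vector <a, a + 2, a + 4, a + 6>.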
2342 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2343                             Instruction::BinaryOps BinOp, ElementCount VF,
2344                             IRBuilder<> &Builder) {
2345   assert(VF.isVector() && "only vector VFs are supported");
2346 
2347   // Create and check the types.
2348   auto *ValVTy = cast<VectorType>(Val->getType());
2349   ElementCount VLen = ValVTy->getElementCount();
2350 
2351   Type *STy = Val->getType()->getScalarType();
2352   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2353          "Induction Step must be an integer or FP");
2354   assert(Step->getType() == STy && "Step has wrong type");
2355 
2356   SmallVector<Constant *, 8> Indices;
2357 
  // Create a vector of consecutive numbers from zero to VF - 1.
2359   VectorType *InitVecValVTy = ValVTy;
2360   Type *InitVecValSTy = STy;
2361   if (STy->isFloatingPointTy()) {
2362     InitVecValSTy =
2363         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2364     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2365   }
2366   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2367 
2368   // Splat the StartIdx
2369   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2370 
2371   if (STy->isIntegerTy()) {
2372     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2373     Step = Builder.CreateVectorSplat(VLen, Step);
2374     assert(Step->getType() == Val->getType() && "Invalid step vec");
2375     // FIXME: The newly created binary instructions should contain nsw/nuw
2376     // flags, which can be found from the original scalar operations.
2377     Step = Builder.CreateMul(InitVec, Step);
2378     return Builder.CreateAdd(Val, Step, "induction");
2379   }
2380 
2381   // Floating point induction.
2382   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2383          "Binary Opcode should be specified for FP induction");
2384   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2385   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2386 
2387   Step = Builder.CreateVectorSplat(VLen, Step);
2388   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2389   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2390 }
2391 
2392 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2393     const InductionDescriptor &II, Value *Step, Value *Start,
2394     Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
2395   IRBuilder<> &Builder = State.Builder;
2396   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2397          "Expected either an induction phi-node or a truncate of it!");
2398 
  // Construct the initial value of the vector IV in the vector loop preheader.
2400   auto CurrIP = Builder.saveIP();
2401   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2402   if (isa<TruncInst>(EntryVal)) {
2403     assert(Start->getType()->isIntegerTy() &&
2404            "Truncation requires an integer type");
2405     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2406     Step = Builder.CreateTrunc(Step, TruncType);
2407     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2408   }
2409 
2410   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
2411   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
2412   Value *SteppedStart = getStepVector(
2413       SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder);
2414 
2415   // We create vector phi nodes for both integer and floating-point induction
2416   // variables. Here, we determine the kind of arithmetic we will perform.
2417   Instruction::BinaryOps AddOp;
2418   Instruction::BinaryOps MulOp;
2419   if (Step->getType()->isIntegerTy()) {
2420     AddOp = Instruction::Add;
2421     MulOp = Instruction::Mul;
2422   } else {
2423     AddOp = II.getInductionOpcode();
2424     MulOp = Instruction::FMul;
2425   }
2426 
2427   // Multiply the vectorization factor by the step using integer or
2428   // floating-point arithmetic as appropriate.
2429   Type *StepType = Step->getType();
2430   Value *RuntimeVF;
2431   if (Step->getType()->isFloatingPointTy())
2432     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
2433   else
2434     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
2435   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2436 
2437   // Create a vector splat to use in the induction update.
2438   //
2439   // FIXME: If the step is non-constant, we create the vector splat with
2440   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2441   //        handle a constant vector splat.
2442   Value *SplatVF = isa<Constant>(Mul)
2443                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
2444                        : Builder.CreateVectorSplat(State.VF, Mul);
2445   Builder.restoreIP(CurrIP);
2446 
2447   // We may need to add the step a number of times, depending on the unroll
2448   // factor. The last of those goes into the PHI.
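  // E.g. (illustrative), with UF = 2 the per-part values are:
  //   part 0: %vec.ind
  //   part 1: %vec.ind + VF * Step        ("step.add")
  // and the value fed back into the phi from the latch is
  // %vec.ind + 2 * VF * Step ("vec.ind.next").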
2449   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2450                                     &*LoopVectorBody->getFirstInsertionPt());
2451   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2452   Instruction *LastInduction = VecInd;
2453   for (unsigned Part = 0; Part < UF; ++Part) {
2454     State.set(Def, LastInduction, Part);
2455 
2456     if (isa<TruncInst>(EntryVal))
2457       addMetadata(LastInduction, EntryVal);
2458 
2459     LastInduction = cast<Instruction>(
2460         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2461     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2462   }
2463 
2464   // Move the last step to the end of the latch block. This ensures consistent
2465   // placement of all induction updates.
2466   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2467   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2468   LastInduction->moveBefore(Br);
2469   LastInduction->setName("vec.ind.next");
2470 
2471   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2472   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2473 }
2474 
2475 void InnerLoopVectorizer::widenIntOrFpInduction(
2476     PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State,
2477     Value *CanonicalIV) {
2478   Value *Start = Def->getStartValue()->getLiveInIRValue();
2479   const InductionDescriptor &ID = Def->getInductionDescriptor();
2480   TruncInst *Trunc = Def->getTruncInst();
2481   IRBuilder<> &Builder = State.Builder;
2482   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2483   assert(!State.VF.isZero() && "VF must be non-zero");
2484 
2485   // The value from the original loop to which we are mapping the new induction
2486   // variable.
2487   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2488 
2489   auto &DL = EntryVal->getModule()->getDataLayout();
2490 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2493   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2494     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2495            "Induction step should be loop invariant");
2496     if (PSE.getSE()->isSCEVable(IV->getType())) {
2497       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2498       return Exp.expandCodeFor(Step, Step->getType(),
2499                                State.CFG.VectorPreHeader->getTerminator());
2500     }
2501     return cast<SCEVUnknown>(Step)->getValue();
2502   };
2503 
2504   // The scalar value to broadcast. This is derived from the canonical
2505   // induction variable. If a truncation type is given, truncate the canonical
2506   // induction variable and step. Otherwise, derive these values from the
2507   // induction descriptor.
2508   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2509     Value *ScalarIV = CanonicalIV;
2510     Type *NeededType = IV->getType();
2511     if (!Def->isCanonical() || ScalarIV->getType() != NeededType) {
2512       ScalarIV =
2513           NeededType->isIntegerTy()
2514               ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType)
2515               : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType);
2516       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID,
2517                                       State.CFG.PrevBB);
2518       ScalarIV->setName("offset.idx");
2519     }
2520     if (Trunc) {
2521       auto *TruncType = cast<IntegerType>(Trunc->getType());
2522       assert(Step->getType()->isIntegerTy() &&
2523              "Truncation requires an integer step");
2524       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2525       Step = Builder.CreateTrunc(Step, TruncType);
2526     }
2527     return ScalarIV;
2528   };
2529 
2530   // Fast-math-flags propagate from the original induction instruction.
2531   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2532   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2533     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2534 
2535   // Now do the actual transformations, and start with creating the step value.
2536   Value *Step = CreateStepValue(ID.getStep());
2537   if (State.VF.isScalar()) {
2538     Value *ScalarIV = CreateScalarIV(Step);
2539     Type *ScalarTy = IntegerType::get(ScalarIV->getContext(),
2540                                       Step->getType()->getScalarSizeInBits());
2541 
2542     Instruction::BinaryOps IncOp = ID.getInductionOpcode();
2543     if (IncOp == Instruction::BinaryOpsEnd)
2544       IncOp = Instruction::Add;
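    // E.g. (illustrative), for an integer induction with UF = 2 the per-part
    // values are ScalarIV and ScalarIV + Step.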
2545     for (unsigned Part = 0; Part < UF; ++Part) {
2546       Value *StartIdx = ConstantInt::get(ScalarTy, Part);
2547       Instruction::BinaryOps MulOp = Instruction::Mul;
2548       if (Step->getType()->isFloatingPointTy()) {
2549         StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType());
2550         MulOp = Instruction::FMul;
2551       }
2552 
2553       Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2554       Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction");
2555       State.set(Def, EntryPart, Part);
2556       if (Trunc) {
2557         assert(!Step->getType()->isFloatingPointTy() &&
2558                "fp inductions shouldn't be truncated");
2559         addMetadata(EntryPart, Trunc);
2560       }
2561     }
2562     return;
2563   }
2564 
2565   // Create a new independent vector induction variable, if one is needed.
2566   if (Def->needsVectorIV())
2567     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
2568 
2569   if (Def->needsScalarIV()) {
2570     // Create scalar steps that can be used by instructions we will later
2571     // scalarize. Note that the addition of the scalar steps will not increase
2572     // the number of instructions in the loop in the common case prior to
2573     // InstCombine. We will be trading one vector extract for each scalar step.
2574     Value *ScalarIV = CreateScalarIV(Step);
2575     buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
2576   }
2577 }
2578 
2579 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2580                                            Instruction *EntryVal,
2581                                            const InductionDescriptor &ID,
2582                                            VPValue *Def,
2583                                            VPTransformState &State) {
2584   IRBuilder<> &Builder = State.Builder;
2585   // We shouldn't have to build scalar steps if we aren't vectorizing.
2586   assert(State.VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2588   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2589   assert(ScalarIVTy == Step->getType() &&
2590          "Val and Step should have the same type");
2591 
2592   // We build scalar steps for both integer and floating-point induction
2593   // variables. Here, we determine the kind of arithmetic we will perform.
2594   Instruction::BinaryOps AddOp;
2595   Instruction::BinaryOps MulOp;
2596   if (ScalarIVTy->isIntegerTy()) {
2597     AddOp = Instruction::Add;
2598     MulOp = Instruction::Mul;
2599   } else {
2600     AddOp = ID.getInductionOpcode();
2601     MulOp = Instruction::FMul;
2602   }
2603 
2604   // Determine the number of scalars we need to generate for each unroll
2605   // iteration. If EntryVal is uniform, we only need to generate the first
2606   // lane. Otherwise, we generate all VF values.
2607   bool IsUniform =
2608       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), State.VF);
2609   unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
2610   // Compute the scalar steps and save the results in State.
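  // E.g. (illustrative), for a fixed VF = 4 and an integer induction, lane L
  // of part P receives the value ScalarIV + (P * 4 + L) * Step.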
2611   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2612                                      ScalarIVTy->getScalarSizeInBits());
2613   Type *VecIVTy = nullptr;
2614   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2615   if (!IsUniform && State.VF.isScalable()) {
2616     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2617     UnitStepVec =
2618         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2619     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2620     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2621   }
2622 
2623   for (unsigned Part = 0; Part < State.UF; ++Part) {
2624     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2625 
2626     if (!IsUniform && State.VF.isScalable()) {
2627       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2628       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2629       if (ScalarIVTy->isFloatingPointTy())
2630         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2631       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2632       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2633       State.set(Def, Add, Part);
      // It's also useful to record the per-lane values for the known minimum
      // number of elements, so we do that below as well. This improves code
      // quality when, for example, only the first element needs to be
      // extracted.
2637     }
2638 
2639     if (ScalarIVTy->isFloatingPointTy())
2640       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2641 
2642     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2643       Value *StartIdx = Builder.CreateBinOp(
2644           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2645       // The step returned by `createStepForVF` is a runtime-evaluated value
2646       // when VF is scalable. Otherwise, it should be folded into a Constant.
2647       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2648              "Expected StartIdx to be folded to a constant when VF is not "
2649              "scalable");
2650       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2651       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2652       State.set(Def, Add, VPIteration(Part, Lane));
2653     }
2654   }
2655 }
2656 
2657 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2658                                                     const VPIteration &Instance,
2659                                                     VPTransformState &State) {
2660   Value *ScalarInst = State.get(Def, Instance);
2661   Value *VectorValue = State.get(Def, Instance.Part);
2662   VectorValue = Builder.CreateInsertElement(
2663       VectorValue, ScalarInst,
2664       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2665   State.set(Def, VectorValue, Instance.Part);
2666 }
2667 
2668 // Return whether we allow using masked interleave-groups (for dealing with
2669 // strided loads/stores that reside in predicated blocks, or for dealing
2670 // with gaps).
2671 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2672   // If an override option has been passed in for interleaved accesses, use it.
2673   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2674     return EnableMaskedInterleavedMemAccesses;
2675 
2676   return TTI.enableMaskedInterleavedAccessVectorization();
2677 }
2678 
2679 // Try to vectorize the interleave group that \p Instr belongs to.
2680 //
2681 // E.g. Translate following interleaved load group (factor = 3):
2682 //   for (i = 0; i < N; i+=3) {
2683 //     R = Pic[i];             // Member of index 0
2684 //     G = Pic[i+1];           // Member of index 1
2685 //     B = Pic[i+2];           // Member of index 2
2686 //     ... // do something to R, G, B
2687 //   }
2688 // To:
2689 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2690 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2691 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2692 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2693 //
2694 // Or translate following interleaved store group (factor = 3):
2695 //   for (i = 0; i < N; i+=3) {
2696 //     ... do something to R, G, B
2697 //     Pic[i]   = R;           // Member of index 0
2698 //     Pic[i+1] = G;           // Member of index 1
2699 //     Pic[i+2] = B;           // Member of index 2
2700 //   }
2701 // To:
2702 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2703 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2704 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2705 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2706 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2707 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2708     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2709     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2710     VPValue *BlockInMask) {
2711   Instruction *Instr = Group->getInsertPos();
2712   const DataLayout &DL = Instr->getModule()->getDataLayout();
2713 
2714   // Prepare for the vector type of the interleaved load/store.
2715   Type *ScalarTy = getLoadStoreType(Instr);
2716   unsigned InterleaveFactor = Group->getFactor();
2717   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2718   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2719 
2720   // Prepare for the new pointers.
2721   SmallVector<Value *, 2> AddrParts;
2722   unsigned Index = Group->getIndex(Instr);
2723 
2724   // TODO: extend the masked interleaved-group support to reversed access.
2725   assert((!BlockInMask || !Group->isReverse()) &&
2726          "Reversed masked interleave-group not supported.");
2727 
2728   // If the group is reverse, adjust the index to refer to the last vector lane
2729   // instead of the first. We adjust the index from the first vector lane,
2730   // rather than directly getting the pointer for lane VF - 1, because the
2731   // pointer operand of the interleaved access is supposed to be uniform. For
2732   // uniform instructions, we're only required to generate a value for the
2733   // first vector lane in each unroll iteration.
2734   if (Group->isReverse())
2735     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2736 
2737   for (unsigned Part = 0; Part < UF; Part++) {
2738     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2739     setDebugLocFromInst(AddrPart);
2740 
    // Note that the current instruction may be a member at any index. We need
    // to adjust the address so it points at the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2752 
2753     bool InBounds = false;
2754     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2755       InBounds = gep->isInBounds();
2756     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2757     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2758 
2759     // Cast to the vector pointer type.
2760     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2761     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2762     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2763   }
2764 
2765   setDebugLocFromInst(Instr);
2766   Value *PoisonVec = PoisonValue::get(VecTy);
2767 
2768   Value *MaskForGaps = nullptr;
2769   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2770     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2771     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2772   }
2773 
2774   // Vectorize the interleaved load group.
2775   if (isa<LoadInst>(Instr)) {
2776     // For each unroll part, create a wide load for the group.
2777     SmallVector<Value *, 2> NewLoads;
2778     for (unsigned Part = 0; Part < UF; Part++) {
2779       Instruction *NewLoad;
2780       if (BlockInMask || MaskForGaps) {
2781         assert(useMaskedInterleavedAccesses(*TTI) &&
2782                "masked interleaved groups are not allowed.");
2783         Value *GroupMask = MaskForGaps;
2784         if (BlockInMask) {
2785           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2786           Value *ShuffledMask = Builder.CreateShuffleVector(
2787               BlockInMaskPart,
2788               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2789               "interleaved.mask");
2790           GroupMask = MaskForGaps
2791                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2792                                                 MaskForGaps)
2793                           : ShuffledMask;
2794         }
2795         NewLoad =
2796             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2797                                      GroupMask, PoisonVec, "wide.masked.vec");
2798       }
2799       else
2800         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2801                                             Group->getAlign(), "wide.vec");
2802       Group->addMetadata(NewLoad);
2803       NewLoads.push_back(NewLoad);
2804     }
2805 
2806     // For each member in the group, shuffle out the appropriate data from the
2807     // wide loads.
2808     unsigned J = 0;
2809     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2810       Instruction *Member = Group->getMember(I);
2811 
2812       // Skip the gaps in the group.
2813       if (!Member)
2814         continue;
2815 
2816       auto StrideMask =
2817           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2818       for (unsigned Part = 0; Part < UF; Part++) {
2819         Value *StridedVec = Builder.CreateShuffleVector(
2820             NewLoads[Part], StrideMask, "strided.vec");
2821 
        // If this member has a different type, cast the result to that type.
2823         if (Member->getType() != ScalarTy) {
2824           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2825           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2826           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2827         }
2828 
2829         if (Group->isReverse())
2830           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2831 
2832         State.set(VPDefs[J], StridedVec, Part);
2833       }
2834       ++J;
2835     }
2836     return;
2837   }
2838 
  // The subvector type for the current instruction.
2840   auto *SubVT = VectorType::get(ScalarTy, VF);
2841 
2842   // Vectorize the interleaved store group.
2843   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2844   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2845          "masked interleaved groups are not allowed.");
2846   assert((!MaskForGaps || !VF.isScalable()) &&
2847          "masking gaps for scalable vectors is not yet supported.");
2848   for (unsigned Part = 0; Part < UF; Part++) {
2849     // Collect the stored vector from each member.
2850     SmallVector<Value *, 4> StoredVecs;
2851     for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Failed to get a member from an interleaved store group");
2854       Instruction *Member = Group->getMember(i);
2855 
2856       // Skip the gaps in the group.
2857       if (!Member) {
        Value *Poison = PoisonValue::get(SubVT);
        StoredVecs.push_back(Poison);
2860         continue;
2861       }
2862 
2863       Value *StoredVec = State.get(StoredValues[i], Part);
2864 
2865       if (Group->isReverse())
2866         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2867 
      // If this member has a different type, cast it to a unified type.
2870       if (StoredVec->getType() != SubVT)
2871         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2872 
2873       StoredVecs.push_back(StoredVec);
2874     }
2875 
2876     // Concatenate all vectors into a wide vector.
2877     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2878 
2879     // Interleave the elements in the wide vector.
2880     Value *IVec = Builder.CreateShuffleVector(
2881         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2882         "interleaved.vec");
2883 
2884     Instruction *NewStoreInstr;
2885     if (BlockInMask || MaskForGaps) {
2886       Value *GroupMask = MaskForGaps;
2887       if (BlockInMask) {
2888         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2889         Value *ShuffledMask = Builder.CreateShuffleVector(
2890             BlockInMaskPart,
2891             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2892             "interleaved.mask");
2893         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2894                                                       ShuffledMask, MaskForGaps)
2895                                 : ShuffledMask;
2896       }
2897       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2898                                                 Group->getAlign(), GroupMask);
2899     } else
2900       NewStoreInstr =
2901           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2902 
2903     Group->addMetadata(NewStoreInstr);
2904   }
2905 }
2906 
2907 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2908                                                VPReplicateRecipe *RepRecipe,
2909                                                const VPIteration &Instance,
2910                                                bool IfPredicateInstr,
2911                                                VPTransformState &State) {
2912   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2913 
2914   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2915   // the first lane and part.
2916   if (isa<NoAliasScopeDeclInst>(Instr))
2917     if (!Instance.isFirstIteration())
2918       return;
2919 
2920   setDebugLocFromInst(Instr);
2921 
  // Does this instruction return a value?
2923   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2924 
2925   Instruction *Cloned = Instr->clone();
2926   if (!IsVoidRetTy)
2927     Cloned->setName(Instr->getName() + ".cloned");
2928 
  // If the scalarized instruction contributes to the address computation of a
  // widened masked load/store which was in a basic block that needed
  // predication and is not predicated after vectorization, we can't propagate
  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
  // instruction could otherwise feed a poison value to the base address of the
  // widened load/store.
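  //
  // E.g. (illustrative), an "inbounds" GEP that feeds the address of such a
  // widened masked load must have "inbounds" dropped: lanes that were
  // previously predicated out may now compute an out-of-bounds (poison)
  // address, even though the mask keeps them from being accessed.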
2935   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2936     Cloned->dropPoisonGeneratingFlags();
2937 
2938   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
2939                                Builder.GetInsertPoint());
  // Replace the operands of the cloned instruction with their scalar
  // equivalents in the new loop.
2942   for (auto &I : enumerate(RepRecipe->operands())) {
2943     auto InputInstance = Instance;
2944     VPValue *Operand = I.value();
2945     if (State.Plan->isUniformAfterVectorization(Operand))
2946       InputInstance.Lane = VPLane::getFirstLane();
2947     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2948   }
2949   addNewMetadata(Cloned, Instr);
2950 
2951   // Place the cloned scalar in the new loop.
2952   Builder.Insert(Cloned);
2953 
2954   State.set(RepRecipe, Cloned, Instance);
2955 
  // If we just cloned a new assumption, add it to the assumption cache.
2957   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2958     AC->registerAssumption(II);
2959 
2960   // End if-block.
2961   if (IfPredicateInstr)
2962     PredicatedInstructions.push_back(Cloned);
2963 }
2964 
2965 void InnerLoopVectorizer::createHeaderBranch(Loop *L) {
2966   BasicBlock *Header = L->getHeader();
2967   assert(!L->getLoopLatch() && "loop should not have a latch at this point");
2968 
2969   IRBuilder<> B(Header->getTerminator());
2970   Instruction *OldInst =
2971       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
2972   setDebugLocFromInst(OldInst, &B);
2973 
  // Connect the header to the exit block and back to itself, replacing the
  // old terminator.
2976   B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header);
2977 
2978   // Now we have two terminators. Remove the old one from the block.
2979   Header->getTerminator()->eraseFromParent();
2980 }
2981 
2982 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2983   if (TripCount)
2984     return TripCount;
2985 
2986   assert(L && "Create Trip Count for null loop.");
2987   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2988   // Find the loop boundaries.
2989   ScalarEvolution *SE = PSE.getSE();
2990   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2991   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2992          "Invalid loop count");
2993 
2994   Type *IdxTy = Legal->getWidestInductionType();
2995   assert(IdxTy && "No type for induction");
2996 
  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way we get a backedge-taken count in that case is if the
  // induction variable was signed and as such will not overflow, so the
  // truncation is legal.
3002   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3003       IdxTy->getPrimitiveSizeInBits())
3004     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3005   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3006 
3007   // Get the total trip count from the count by adding 1.
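  // E.g. (illustrative), for a loop "for (i = 0; i < n; ++i)" the
  // backedge-taken count is n - 1 and the trip count computed here is n.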
3008   const SCEV *ExitCount = SE->getAddExpr(
3009       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3010 
3011   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3012 
3013   // Expand the trip count and place the new instructions in the preheader.
3014   // Notice that the pre-header does not change, only the loop body.
3015   SCEVExpander Exp(*SE, DL, "induction");
3016 
3017   // Count holds the overall loop count (N).
3018   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3019                                 L->getLoopPreheader()->getTerminator());
3020 
3021   if (TripCount->getType()->isPointerTy())
3022     TripCount =
3023         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3024                                     L->getLoopPreheader()->getTerminator());
3025 
3026   return TripCount;
3027 }
3028 
3029 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3030   if (VectorTripCount)
3031     return VectorTripCount;
3032 
3033   Value *TC = getOrCreateTripCount(L);
3034   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3035 
3036   Type *Ty = TC->getType();
3037   // This is where we can make the step a runtime constant.
3038   Value *Step = createStepForVF(Builder, Ty, VF, UF);
3039 
3040   // If the tail is to be folded by masking, round the number of iterations N
3041   // up to a multiple of Step instead of rounding down. This is done by first
3042   // adding Step-1 and then rounding down. Note that it's ok if this addition
3043   // overflows: the vector induction variable will eventually wrap to zero given
3044   // that it starts at zero and its Step is a power of two; the loop will then
3045   // exit, with the last early-exit vector comparison also producing all-true.
3046   if (Cost->foldTailByMasking()) {
3047     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3048            "VF*UF must be a power of 2 when folding tail by masking");
3049     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
3050     TC = Builder.CreateAdd(
3051         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
3052   }
3053 
3054   // Now we need to generate the expression for the part of the loop that the
3055   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3056   // iterations are not required for correctness, or N - Step, otherwise. Step
3057   // is equal to the vectorization factor (number of SIMD elements) times the
3058   // unroll factor (number of SIMD instructions).
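  // E.g. (illustrative), with VF = 4 and UF = 2 (Step = 8) and a trip count of
  // 10, n.mod.vf is 2 and n.vec is 8, leaving two iterations for the scalar
  // epilogue. If the tail is folded instead, the count is first rounded up to
  // 17, so n.vec becomes 16 and the masked vector loop covers all iterations.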
3059   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3060 
3061   // There are cases where we *must* run at least one iteration in the remainder
3062   // loop.  See the cost model for when this can happen.  If the step evenly
3063   // divides the trip count, we set the remainder to be equal to the step. If
3064   // the step does not evenly divide the trip count, no adjustment is necessary
3065   // since there will already be scalar iterations. Note that the minimum
3066   // iterations check ensures that N >= Step.
3067   if (Cost->requiresScalarEpilogue(VF)) {
3068     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3069     R = Builder.CreateSelect(IsZero, Step, R);
3070   }
3071 
3072   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3073 
3074   return VectorTripCount;
3075 }
3076 
3077 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3078                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
  auto *DstFVTy = cast<FixedVectorType>(DstVTy);
  unsigned VF = DstFVTy->getNumElements();
  auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
3084   Type *SrcElemTy = SrcVecTy->getElementType();
3085   Type *DstElemTy = DstFVTy->getElementType();
3086   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3087          "Vector elements must have same size");
3088 
3089   // Do a direct cast if element types are castable.
3090   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3091     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3092   }
  // V cannot be cast directly to the desired vector type. This may happen when
  // V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step cast through an intermediate
  // integer vector type, i.e. Ptr <-> Int <-> Float.
3097   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3098          "Only one type should be a pointer type");
3099   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3100          "Only one type should be a floating point type");
3101   Type *IntTy =
3102       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3103   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3104   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3105   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3106 }
3107 
3108 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3109                                                          BasicBlock *Bypass) {
3110   Value *Count = getOrCreateTripCount(L);
  // Reuse the existing vector loop preheader for the TC checks.
  // Note that a new preheader block is generated for the vector loop.
3113   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3114   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3115 
3116   // Generate code to check if the loop's trip count is less than VF * UF, or
3117   // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding
  // one to the backedge-taken count overflowed, leading to an incorrect trip
  // count of zero. In this case we will also jump to the scalar loop.
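  //
  // E.g. (illustrative), with VF = 4 and UF = 2 the generated check branches
  // to the scalar loop when the trip count is less than 8 (or less than or
  // equal to 8 if a scalar epilogue is required).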
3121   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
3122                                             : ICmpInst::ICMP_ULT;
3123 
  // If the tail is to be folded, the vector loop takes care of all iterations.
3125   Value *CheckMinIters = Builder.getFalse();
3126   if (!Cost->foldTailByMasking()) {
3127     Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
3128     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3129   }
  // Create a new preheader for the vector loop.
3131   LoopVectorPreHeader =
3132       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3133                  "vector.ph");
3134 
3135   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3136                                DT->getNode(Bypass)->getIDom()) &&
3137          "TC check is expected to dominate Bypass");
3138 
3139   // Update dominator for Bypass & LoopExit (if needed).
3140   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3141   if (!Cost->requiresScalarEpilogue(VF))
3142     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3144     // dominator of the exit blocks.
3145     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3146 
3147   ReplaceInstWithInst(
3148       TCCheckBlock->getTerminator(),
3149       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3150   LoopBypassBlocks.push_back(TCCheckBlock);
3151 }
3152 
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  BasicBlock *const SCEVCheckBlock =
3156       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3157   if (!SCEVCheckBlock)
3158     return nullptr;
3159 
3160   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3161            (OptForSizeBasedOnProfile &&
3162             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3163          "Cannot SCEV check stride or overflow when optimizing for size");

  // Update the dominator only if this is the first RT check.
3167   if (LoopBypassBlocks.empty()) {
3168     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3169     if (!Cost->requiresScalarEpilogue(VF))
3170       // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
3172       // dominator of the exit blocks.
3173       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3174   }
3175 
3176   LoopBypassBlocks.push_back(SCEVCheckBlock);
3177   AddedSafetyChecks = true;
3178   return SCEVCheckBlock;
3179 }
3180 
3181 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3182                                                       BasicBlock *Bypass) {
  // The VPlan-native path currently does not do any analysis for runtime
  // checks.
3184   if (EnableVPlanNativePath)
3185     return nullptr;
3186 
3187   BasicBlock *const MemCheckBlock =
3188       RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3189 
  // Check whether we generated code that checks at runtime if arrays overlap.
  // We put the checks into a separate block to make the more common case of
  // few elements faster.
3193   if (!MemCheckBlock)
3194     return nullptr;
3195 
3196   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3197     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3198            "Cannot emit memory checks when optimizing for size, unless forced "
3199            "to vectorize.");
3200     ORE->emit([&]() {
3201       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3202                                         L->getStartLoc(), L->getHeader())
3203              << "Code-size may be reduced by not forcing "
3204                 "vectorization, or by source-code modifications "
3205                 "eliminating the need for runtime checks "
3206                 "(e.g., adding 'restrict').";
3207     });
3208   }
3209 
3210   LoopBypassBlocks.push_back(MemCheckBlock);
3211 
3212   AddedSafetyChecks = true;
3213 
3214   // We currently don't use LoopVersioning for the actual loop cloning but we
3215   // still use it to add the noalias metadata.
3216   LVer = std::make_unique<LoopVersioning>(
3217       *Legal->getLAI(),
3218       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3219       DT, PSE.getSE());
3220   LVer->prepareNoAliasMetadata();
3221   return MemCheckBlock;
3222 }
3223 
3224 Value *InnerLoopVectorizer::emitTransformedIndex(
3225     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3226     const InductionDescriptor &ID, BasicBlock *VectorHeader) const {
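  // In summary (illustrative): for an integer or pointer induction with start
  // value Start and step Step this returns Start + Index * Step (emitted as a
  // GEP for pointer inductions), and for an FP induction it returns
  // Start FAdd/FSub (Index * Step) using the original binary operator.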
3227 
3228   SCEVExpander Exp(*SE, DL, "induction");
3229   auto Step = ID.getStep();
3230   auto StartValue = ID.getStartValue();
3231   assert(Index->getType()->getScalarType() == Step->getType() &&
3232          "Index scalar type does not match StepValue type");
3233 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
3240   auto CreateAdd = [&B](Value *X, Value *Y) {
3241     assert(X->getType() == Y->getType() && "Types don't match!");
3242     if (auto *CX = dyn_cast<ConstantInt>(X))
3243       if (CX->isZero())
3244         return Y;
3245     if (auto *CY = dyn_cast<ConstantInt>(Y))
3246       if (CY->isZero())
3247         return X;
3248     return B.CreateAdd(X, Y);
3249   };
3250 
3251   // We allow X to be a vector type, in which case Y will potentially be
3252   // splatted into a vector with the same element count.
3253   auto CreateMul = [&B](Value *X, Value *Y) {
3254     assert(X->getType()->getScalarType() == Y->getType() &&
3255            "Types don't match!");
3256     if (auto *CX = dyn_cast<ConstantInt>(X))
3257       if (CX->isOne())
3258         return Y;
3259     if (auto *CY = dyn_cast<ConstantInt>(Y))
3260       if (CY->isOne())
3261         return X;
3262     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3263     if (XVTy && !isa<VectorType>(Y->getType()))
3264       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3265     return B.CreateMul(X, Y);
3266   };
3267 
3268   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3269   // loop, choose the end of the vector loop header (=VectorHeader), because
3270   // the DomTree is not kept up-to-date for additional blocks generated in the
3271   // vector loop. By using the header as insertion point, we guarantee that the
3272   // expanded instructions dominate all their uses.
3273   auto GetInsertPoint = [this, &B, VectorHeader]() {
3274     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3275     if (InsertBB != LoopVectorBody &&
3276         LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB))
3277       return VectorHeader->getTerminator();
3278     return &*B.GetInsertPoint();
3279   };
3280 
3281   switch (ID.getKind()) {
3282   case InductionDescriptor::IK_IntInduction: {
3283     assert(!isa<VectorType>(Index->getType()) &&
3284            "Vector indices not supported for integer inductions yet");
3285     assert(Index->getType() == StartValue->getType() &&
3286            "Index type does not match StartValue type");
3287     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3288       return B.CreateSub(StartValue, Index);
3289     auto *Offset = CreateMul(
3290         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3291     return CreateAdd(StartValue, Offset);
3292   }
3293   case InductionDescriptor::IK_PtrInduction: {
3294     assert(isa<SCEVConstant>(Step) &&
3295            "Expected constant step for pointer induction");
3296     return B.CreateGEP(
3297         ID.getElementType(), StartValue,
3298         CreateMul(Index,
3299                   Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3300                                     GetInsertPoint())));
3301   }
3302   case InductionDescriptor::IK_FpInduction: {
3303     assert(!isa<VectorType>(Index->getType()) &&
3304            "Vector indices not supported for FP inductions yet");
3305     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3306     auto InductionBinOp = ID.getInductionBinOp();
3307     assert(InductionBinOp &&
3308            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3309             InductionBinOp->getOpcode() == Instruction::FSub) &&
3310            "Original bin op should be defined for FP induction");
3311 
3312     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3313     Value *MulExp = B.CreateFMul(StepValue, Index);
3314     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3315                          "induction");
3316   }
3317   case InductionDescriptor::IK_NoInduction:
3318     return nullptr;
3319   }
3320   llvm_unreachable("invalid enum");
3321 }
3322 
3323 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3324   LoopScalarBody = OrigLoop->getHeader();
3325   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3326   assert(LoopVectorPreHeader && "Invalid loop structure");
3327   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3328   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3329          "multiple exit loop without required epilogue?");
3330 
3331   LoopMiddleBlock =
3332       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3333                  LI, nullptr, Twine(Prefix) + "middle.block");
3334   LoopScalarPreHeader =
3335       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3336                  nullptr, Twine(Prefix) + "scalar.ph");
3337 
3338   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3339 
3340   // Set up the middle block terminator.  Two cases:
3341   // 1) If we know that we must execute the scalar epilogue, emit an
3342   //    unconditional branch.
3343   // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case).  In this case, set up a conditional
3345   //    branch from the middle block to the loop scalar preheader, and the
3346   //    exit block.  completeLoopSkeleton will update the condition to use an
3347   //    iteration check, if required to decide whether to execute the remainder.
3348   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3349     BranchInst::Create(LoopScalarPreHeader) :
3350     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3351                        Builder.getTrue());
3352   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3353   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3354 
  // We intentionally don't let SplitBlock update LoopInfo since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
3358   LoopVectorBody =
3359       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3360                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3361 
3362   // Update dominator for loop exit.
3363   if (!Cost->requiresScalarEpilogue(VF))
3364     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3366     // dominator of the exit blocks.
3367     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3368 
3369   // Create and register the new vector loop.
3370   Loop *Lp = LI->AllocateLoop();
3371   Loop *ParentLoop = OrigLoop->getParentLoop();
3372 
3373   // Insert the new loop into the loop nest and register the new basic blocks
3374   // before calling any utilities such as SCEV that require valid LoopInfo.
3375   if (ParentLoop) {
3376     ParentLoop->addChildLoop(Lp);
3377   } else {
3378     LI->addTopLevelLoop(Lp);
3379   }
3380   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3381   return Lp;
3382 }
3383 
3384 void InnerLoopVectorizer::createInductionResumeValues(
3385     Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) {
3386   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3387           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3388          "Inconsistent information about additional bypass.");
3389 
3390   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3391   assert(VectorTripCount && L && "Expected valid arguments");
3392   // We are going to resume the execution of the scalar loop.
3393   // Go over all of the induction variables that we found and fix the
3394   // PHIs that are left in the scalar version of the loop.
3395   // The starting values of PHI nodes depend on the counter of the last
3396   // iteration in the vectorized loop.
3397   // If we come from a bypass edge then we need to start from the original
3398   // start value.
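  // E.g. (illustrative), for a secondary integer induction "j = J0 + i * S"
  // the resume value coming in from the middle block is
  // J0 + VectorTripCount * S, while the incoming values from the bypass
  // blocks are the original start value J0.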
3399   Instruction *OldInduction = Legal->getPrimaryInduction();
3400   for (auto &InductionEntry : Legal->getInductionVars()) {
3401     PHINode *OrigPhi = InductionEntry.first;
3402     InductionDescriptor II = InductionEntry.second;
3403 
    // Create phi nodes to merge from the backedge-taken check block.
3405     PHINode *BCResumeVal =
3406         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3407                         LoopScalarPreHeader->getTerminator());
3408     // Copy original phi DL over to the new one.
3409     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3410     Value *&EndValue = IVEndValues[OrigPhi];
3411     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3412     if (OrigPhi == OldInduction) {
3413       // We know what the end value is.
3414       EndValue = VectorTripCount;
3415     } else {
3416       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3417 
3418       // Fast-math-flags propagate from the original induction instruction.
3419       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3420         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3421 
3422       Type *StepType = II.getStep()->getType();
3423       Instruction::CastOps CastOp =
3424           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3425       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3426       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3427       EndValue =
3428           emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
3429       EndValue->setName("ind.end");
3430 
3431       // Compute the end value for the additional bypass (if applicable).
3432       if (AdditionalBypass.first) {
3433         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3434         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3435                                          StepType, true);
3436         CRD =
3437             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3438         EndValueFromAdditionalBypass =
3439             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
3440         EndValueFromAdditionalBypass->setName("ind.end");
3441       }
3442     }
3443     // The new PHI merges the original incoming value, in case of a bypass,
3444     // or the value at the end of the vectorized loop.
3445     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3446 
3447     // Fix the scalar body counter (PHI node).
3448     // The old induction's phi node in the scalar body needs the truncated
3449     // value.
3450     for (BasicBlock *BB : LoopBypassBlocks)
3451       BCResumeVal->addIncoming(II.getStartValue(), BB);
3452 
3453     if (AdditionalBypass.first)
3454       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3455                                             EndValueFromAdditionalBypass);
3456 
3457     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3458   }
3459 }
3460 
3461 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3462                                                       MDNode *OrigLoopID) {
3463   assert(L && "Expected valid loop.");
3464 
3465   // The trip counts should be cached by now.
3466   Value *Count = getOrCreateTripCount(L);
3467   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3468 
3469   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3470 
3471   // Add a check in the middle block to see if we have completed
3472   // all of the iterations in the first vector loop.  Three cases:
3473   // 1) If we require a scalar epilogue, there is no conditional branch as
3474   //    we unconditionally branch to the scalar preheader.  Do nothing.
3475   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3476   //    Thus if tail is to be folded, we know we don't need to run the
3477   //    remainder and we can use the previous value for the condition (true).
3478   // 3) Otherwise, construct a runtime check.
3479   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3480     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3481                                         Count, VectorTripCount, "cmp.n",
3482                                         LoopMiddleBlock->getTerminator());
3483 
3484     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3485     // of the corresponding compare because they may have ended up with
3486     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3488     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3489     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3490   }
3491 
3492   // Get ready to start creating new instructions into the vectorized body.
3493   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3494          "Inconsistent vector loop preheader");
3495   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3496 
3497 #ifdef EXPENSIVE_CHECKS
3498   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3499   LI->verify(*DT);
3500 #endif
3501 
3502   return LoopVectorPreHeader;
3503 }
3504 
3505 std::pair<BasicBlock *, Value *>
3506 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3507   /*
3508    In this function we generate a new loop. The new loop will contain
3509    the vectorized instructions while the old loop will continue to run the
3510    scalar remainder.
3511 
3512        [ ] <-- loop iteration number check.
3513     /   |
3514    /    v
3515   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3516   |  /  |
3517   | /   v
3518   ||   [ ]     <-- vector pre header.
3519   |/    |
3520   |     v
3521   |    [  ] \
3522   |    [  ]_|   <-- vector loop.
3523   |     |
3524   |     v
3525   \   -[ ]   <--- middle-block.
3526    \/   |
3527    /\   v
3528    | ->[ ]     <--- new preheader.
3529    |    |
3530  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3531    |   [ ] \
3532    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3533     \   |
3534      \  v
3535       >[ ]     <-- exit block(s).
3536    ...
3537    */
3538 
3539   // Get the metadata of the original loop before it gets modified.
3540   MDNode *OrigLoopID = OrigLoop->getLoopID();
3541 
3542   // Workaround!  Compute the trip count of the original loop and cache it
3543   // before we start modifying the CFG.  This code has a systemic problem
3544   // wherein it tries to run analysis over partially constructed IR; this is
3545   // wrong, and not simply for SCEV.  The trip count of the original loop
3546   // simply happens to be prone to hitting this in practice.  In theory, we
3547   // can hit the same issue for any SCEV, or ValueTracking query done during
3548   // mutation.  See PR49900.
3549   getOrCreateTripCount(OrigLoop);
3550 
3551   // Create an empty vector loop, and prepare basic blocks for the runtime
3552   // checks.
3553   Loop *Lp = createVectorLoopSkeleton("");
3554 
  // Now, compare the new count to zero. If it is zero, skip the vector loop
  // and jump to the scalar loop. This check also covers the case where the
3557   // backedge-taken count is uint##_max: adding one to it will overflow leading
3558   // to an incorrect trip count of zero. In this (rare) case we will also jump
3559   // to the scalar loop.
3560   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3561 
3562   // Generate the code to check any assumptions that we've made for SCEV
3563   // expressions.
3564   emitSCEVChecks(Lp, LoopScalarPreHeader);
3565 
3566   // Generate the code that checks in runtime if arrays overlap. We put the
3567   // checks into a separate block to make the more common case of few elements
3568   // faster.
3569   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3570 
3571   createHeaderBranch(Lp);
3572 
3573   // Emit phis for the new starting index of the scalar loop.
3574   createInductionResumeValues(Lp);
3575 
3576   return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
3577 }
3578 
3579 // Fix up external users of the induction variable. At this point, we are
3580 // in LCSSA form, with all external PHIs that use the IV having one input value,
3581 // coming from the remainder loop. We need those PHIs to also have a correct
3582 // value for the IV when arriving directly from the middle block.
3583 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3584                                        const InductionDescriptor &II,
3585                                        Value *CountRoundDown, Value *EndValue,
3586                                        BasicBlock *MiddleBlock) {
3587   // There are two kinds of external IV usages - those that use the value
3588   // computed in the last iteration (the PHI) and those that use the penultimate
3589   // value (the value that feeds into the phi from the loop latch).
3590   // We allow both, but they, obviously, have different values.
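  //
  // For example (illustrative shorthand; block and value names are made up):
  //
  //   loop:
  //     %iv      = phi i64 [ 0, %ph ], [ %iv.next, %loop ] ; penultimate value
  //     %iv.next = add i64 %iv, 1                          ; last value
  //     ...
  //   exit:
  //     %lcssa.last   = phi i64 [ %iv.next, %loop ] ; gets EndValue
  //     %lcssa.penult = phi i64 [ %iv, %loop ]      ; gets EndValue - Step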
3591 
3592   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3593 
3594   DenseMap<Value *, Value *> MissingVals;
3595 
3596   // An external user of the last iteration's value should see the value that
3597   // the remainder loop uses to initialize its own IV.
3598   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3599   for (User *U : PostInc->users()) {
3600     Instruction *UI = cast<Instruction>(U);
3601     if (!OrigLoop->contains(UI)) {
3602       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3603       MissingVals[UI] = EndValue;
3604     }
3605   }
3606 
  // An external user of the penultimate value needs to see EndValue - Step.
3608   // The simplest way to get this is to recompute it from the constituent SCEVs,
3609   // that is Start + (Step * (CRD - 1)).
3610   for (User *U : OrigPhi->users()) {
3611     auto *UI = cast<Instruction>(U);
3612     if (!OrigLoop->contains(UI)) {
3613       const DataLayout &DL =
3614           OrigLoop->getHeader()->getModule()->getDataLayout();
3615       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3616 
3617       IRBuilder<> B(MiddleBlock->getTerminator());
3618 
3619       // Fast-math-flags propagate from the original induction instruction.
3620       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3621         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3622 
3623       Value *CountMinusOne = B.CreateSub(
3624           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3625       Value *CMO =
3626           !II.getStep()->getType()->isIntegerTy()
3627               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3628                              II.getStep()->getType())
3629               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3630       CMO->setName("cast.cmo");
3631       Value *Escape =
3632           emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody);
3633       Escape->setName("ind.escape");
3634       MissingVals[UI] = Escape;
3635     }
3636   }
3637 
3638   for (auto &I : MissingVals) {
3639     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3641     // that is %IV2 = phi [...], [ %IV1, %latch ]
3642     // In this case, if IV1 has an external use, we need to avoid adding both
3643     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3644     // don't already have an incoming value for the middle block.
3645     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3646       PHI->addIncoming(I.second, MiddleBlock);
3647   }
3648 }
3649 
3650 namespace {
3651 
3652 struct CSEDenseMapInfo {
3653   static bool canHandle(const Instruction *I) {
3654     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3655            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3656   }
3657 
3658   static inline Instruction *getEmptyKey() {
3659     return DenseMapInfo<Instruction *>::getEmptyKey();
3660   }
3661 
3662   static inline Instruction *getTombstoneKey() {
3663     return DenseMapInfo<Instruction *>::getTombstoneKey();
3664   }
3665 
3666   static unsigned getHashValue(const Instruction *I) {
3667     assert(canHandle(I) && "Unknown instruction!");
3668     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3669                                                            I->value_op_end()));
3670   }
3671 
3672   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3673     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3674         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3675       return LHS == RHS;
3676     return LHS->isIdenticalTo(RHS);
3677   }
3678 };
3679 
3680 } // end anonymous namespace
3681 
/// Perform CSE of induction variable instructions.
static void cse(BasicBlock *BB) {
  // Perform simple CSE.
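  //
  // For example (illustrative shorthand), if widening produced two identical
  // GEPs,
  //     %g1 = getelementptr inbounds i32, i32* %p, i64 %idx
  //     %g2 = getelementptr inbounds i32, i32* %p, i64 %idx
  // the second one hashes to the same key as the first: uses of %g2 are
  // redirected to %g1 and %g2 is erased.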
3685   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3686   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3687     if (!CSEDenseMapInfo::canHandle(&In))
3688       continue;
3689 
3690     // Check if we can replace this instruction with any of the
3691     // visited instructions.
3692     if (Instruction *V = CSEMap.lookup(&In)) {
3693       In.replaceAllUsesWith(V);
3694       In.eraseFromParent();
3695       continue;
3696     }
3697 
3698     CSEMap[&In] = &In;
3699   }
3700 }
3701 
3702 InstructionCost
3703 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3704                                               bool &NeedToScalarize) const {
3705   Function *F = CI->getCalledFunction();
3706   Type *ScalarRetTy = CI->getType();
3707   SmallVector<Type *, 4> Tys, ScalarTys;
3708   for (auto &ArgOp : CI->args())
3709     ScalarTys.push_back(ArgOp->getType());
3710 
3711   // Estimate cost of scalarized vector call. The source operands are assumed
3712   // to be vectors, so we need to extract individual elements from there,
3713   // execute VF scalar calls, and then gather the result into the vector return
3714   // value.
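  //
  // For example (illustrative numbers only): with VF = 4 and a scalar call of
  // cost 10, the scalarized estimate below is 4 * 10 plus the extract/insert
  // overhead; that estimate is later compared against the cost of a single
  // vector call, if a vector variant of the callee exists.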
3715   InstructionCost ScalarCallCost =
3716       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3717   if (VF.isScalar())
3718     return ScalarCallCost;
3719 
3720   // Compute corresponding vector type for return value and arguments.
3721   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3722   for (Type *ScalarTy : ScalarTys)
3723     Tys.push_back(ToVectorTy(ScalarTy, VF));
3724 
3725   // Compute costs of unpacking argument values for the scalar calls and
3726   // packing the return values to a vector.
3727   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3728 
3729   InstructionCost Cost =
3730       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3731 
3732   // If we can't emit a vector call for this function, then the currently found
3733   // cost is the cost we need to return.
3734   NeedToScalarize = true;
3735   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3736   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3737 
3738   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3739     return Cost;
3740 
3741   // If the corresponding vector cost is cheaper, return its cost.
3742   InstructionCost VectorCallCost =
3743       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3744   if (VectorCallCost < Cost) {
3745     NeedToScalarize = false;
3746     Cost = VectorCallCost;
3747   }
3748   return Cost;
3749 }
3750 
3751 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3752   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3753     return Elt;
3754   return VectorType::get(Elt, VF);
3755 }
3756 
3757 InstructionCost
3758 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3759                                                    ElementCount VF) const {
3760   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3761   assert(ID && "Expected intrinsic call!");
3762   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3763   FastMathFlags FMF;
3764   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3765     FMF = FPMO->getFastMathFlags();
3766 
3767   SmallVector<const Value *> Arguments(CI->args());
3768   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3769   SmallVector<Type *> ParamTys;
3770   std::transform(FTy->param_begin(), FTy->param_end(),
3771                  std::back_inserter(ParamTys),
3772                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3773 
3774   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3775                                     dyn_cast<IntrinsicInst>(CI));
3776   return TTI.getIntrinsicInstrCost(CostAttrs,
3777                                    TargetTransformInfo::TCK_RecipThroughput);
3778 }
3779 
3780 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3781   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3782   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3783   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3784 }
3785 
3786 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3787   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3788   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3789   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3790 }
3791 
3792 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3793   // For every instruction `I` in MinBWs, truncate the operands, create a
3794   // truncated version of `I` and reextend its result. InstCombine runs
3795   // later and will remove any ext/trunc pairs.
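  //
  // For example (illustrative shorthand), an i32 add whose minimal bitwidth
  // is 8 becomes
  //     %a.tr   = trunc <VF x i32> %a to <VF x i8>
  //     %b.tr   = trunc <VF x i32> %b to <VF x i8>
  //     %add.tr = add <VF x i8> %a.tr, %b.tr
  //     %res    = zext <VF x i8> %add.tr to <VF x i32>
  // and all uses of the original add are redirected to %res.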
3796   SmallPtrSet<Value *, 4> Erased;
3797   for (const auto &KV : Cost->getMinimalBitwidths()) {
3798     // If the value wasn't vectorized, we must maintain the original scalar
3799     // type. The absence of the value from State indicates that it
3800     // wasn't vectorized.
3801     // FIXME: Should not rely on getVPValue at this point.
3802     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3803     if (!State.hasAnyVectorValue(Def))
3804       continue;
3805     for (unsigned Part = 0; Part < UF; ++Part) {
3806       Value *I = State.get(Def, Part);
3807       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3808         continue;
3809       Type *OriginalTy = I->getType();
3810       Type *ScalarTruncatedTy =
3811           IntegerType::get(OriginalTy->getContext(), KV.second);
3812       auto *TruncatedTy = VectorType::get(
3813           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3814       if (TruncatedTy == OriginalTy)
3815         continue;
3816 
3817       IRBuilder<> B(cast<Instruction>(I));
3818       auto ShrinkOperand = [&](Value *V) -> Value * {
3819         if (auto *ZI = dyn_cast<ZExtInst>(V))
3820           if (ZI->getSrcTy() == TruncatedTy)
3821             return ZI->getOperand(0);
3822         return B.CreateZExtOrTrunc(V, TruncatedTy);
3823       };
3824 
3825       // The actual instruction modification depends on the instruction type,
3826       // unfortunately.
3827       Value *NewI = nullptr;
3828       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3829         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3830                              ShrinkOperand(BO->getOperand(1)));
3831 
3832         // Any wrapping introduced by shrinking this operation shouldn't be
3833         // considered undefined behavior. So, we can't unconditionally copy
3834         // arithmetic wrapping flags to NewI.
3835         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3836       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3837         NewI =
3838             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3839                          ShrinkOperand(CI->getOperand(1)));
3840       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3841         NewI = B.CreateSelect(SI->getCondition(),
3842                               ShrinkOperand(SI->getTrueValue()),
3843                               ShrinkOperand(SI->getFalseValue()));
3844       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3845         switch (CI->getOpcode()) {
3846         default:
3847           llvm_unreachable("Unhandled cast!");
3848         case Instruction::Trunc:
3849           NewI = ShrinkOperand(CI->getOperand(0));
3850           break;
3851         case Instruction::SExt:
3852           NewI = B.CreateSExtOrTrunc(
3853               CI->getOperand(0),
3854               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3855           break;
3856         case Instruction::ZExt:
3857           NewI = B.CreateZExtOrTrunc(
3858               CI->getOperand(0),
3859               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3860           break;
3861         }
3862       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3863         auto Elements0 =
3864             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3865         auto *O0 = B.CreateZExtOrTrunc(
3866             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3867         auto Elements1 =
3868             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3869         auto *O1 = B.CreateZExtOrTrunc(
3870             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3871 
3872         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3873       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3874         // Don't do anything with the operands, just extend the result.
3875         continue;
3876       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3877         auto Elements =
3878             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3879         auto *O0 = B.CreateZExtOrTrunc(
3880             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3881         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3882         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3883       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3884         auto Elements =
3885             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3886         auto *O0 = B.CreateZExtOrTrunc(
3887             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3888         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3889       } else {
3890         // If we don't know what to do, be conservative and don't do anything.
3891         continue;
3892       }
3893 
3894       // Lastly, extend the result.
3895       NewI->takeName(cast<Instruction>(I));
3896       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3897       I->replaceAllUsesWith(Res);
3898       cast<Instruction>(I)->eraseFromParent();
3899       Erased.insert(I);
3900       State.reset(Def, Res, Part);
3901     }
3902   }
3903 
  // We'll have created a bunch of ZExts that may now be dead. Clean them up.
3905   for (const auto &KV : Cost->getMinimalBitwidths()) {
3906     // If the value wasn't vectorized, we must maintain the original scalar
3907     // type. The absence of the value from State indicates that it
3908     // wasn't vectorized.
3909     // FIXME: Should not rely on getVPValue at this point.
3910     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3911     if (!State.hasAnyVectorValue(Def))
3912       continue;
3913     for (unsigned Part = 0; Part < UF; ++Part) {
3914       Value *I = State.get(Def, Part);
3915       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3916       if (Inst && Inst->use_empty()) {
3917         Value *NewI = Inst->getOperand(0);
3918         Inst->eraseFromParent();
3919         State.reset(Def, NewI, Part);
3920       }
3921     }
3922   }
3923 }
3924 
3925 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
3926   // Insert truncates and extends for any truncated instructions as hints to
3927   // InstCombine.
3928   if (VF.isVector())
3929     truncateToMinimalBitwidths(State);
3930 
3931   // Fix widened non-induction PHIs by setting up the PHI operands.
3932   if (OrigPHIsToFix.size()) {
3933     assert(EnableVPlanNativePath &&
3934            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3935     fixNonInductionPHIs(State);
3936   }
3937 
3938   // At this point every instruction in the original loop is widened to a
3939   // vector form. Now we need to fix the recurrences in the loop. These PHI
3940   // nodes are currently empty because we did not want to introduce cycles.
3941   // This is the second stage of vectorizing recurrences.
3942   fixCrossIterationPHIs(State);
3943 
3944   // Forget the original basic block.
3945   PSE.getSE()->forgetLoop(OrigLoop);
3946 
3947   // If we inserted an edge from the middle block to the unique exit block,
3948   // update uses outside the loop (phis) to account for the newly inserted
3949   // edge.
3950   if (!Cost->requiresScalarEpilogue(VF)) {
3951     // Fix-up external users of the induction variables.
3952     for (auto &Entry : Legal->getInductionVars())
3953       fixupIVUsers(Entry.first, Entry.second,
3954                    getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3955                    IVEndValues[Entry.first], LoopMiddleBlock);
3956 
3957     fixLCSSAPHIs(State);
3958   }
3959 
3960   for (Instruction *PI : PredicatedInstructions)
3961     sinkScalarOperands(&*PI);
3962 
3963   // Remove redundant induction instructions.
3964   cse(LoopVectorBody);
3965 
3966   // Set/update profile weights for the vector and remainder loops as original
3967   // loop iterations are now distributed among them. Note that original loop
3968   // represented by LoopScalarBody becomes remainder loop after vectorization.
3969   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less precise result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
3979   setProfileInfoAfterUnrolling(
3980       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3981       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3982 }
3983 
3984 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3985   // In order to support recurrences we need to be able to vectorize Phi nodes.
3986   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3987   // stage #2: We now need to fix the recurrences by adding incoming edges to
3988   // the currently empty PHI nodes. At this point every instruction in the
3989   // original loop is widened to a vector form so we can use them to construct
3990   // the incoming edges.
3991   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
3992   for (VPRecipeBase &R : Header->phis()) {
3993     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3994       fixReduction(ReductionPhi, State);
3995     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3996       fixFirstOrderRecurrence(FOR, State);
3997   }
3998 }
3999 
4000 void InnerLoopVectorizer::fixFirstOrderRecurrence(
4001     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
4002   // This is the second phase of vectorizing first-order recurrences. An
4003   // overview of the transformation is described below. Suppose we have the
4004   // following loop.
4005   //
4006   //   for (int i = 0; i < n; ++i)
4007   //     b[i] = a[i] - a[i - 1];
4008   //
4009   // There is a first-order recurrence on "a". For this loop, the shorthand
4010   // scalar IR looks like:
4011   //
4012   //   scalar.ph:
4013   //     s_init = a[-1]
4014   //     br scalar.body
4015   //
4016   //   scalar.body:
4017   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4018   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4019   //     s2 = a[i]
4020   //     b[i] = s2 - s1
4021   //     br cond, scalar.body, ...
4022   //
  // In this example, s1 is a recurrence because its value depends on the
4024   // previous iteration. In the first phase of vectorization, we created a
4025   // vector phi v1 for s1. We now complete the vectorization and produce the
4026   // shorthand vector IR shown below (for VF = 4, UF = 1).
4027   //
4028   //   vector.ph:
4029   //     v_init = vector(..., ..., ..., a[-1])
4030   //     br vector.body
4031   //
4032   //   vector.body
4033   //     i = phi [0, vector.ph], [i+4, vector.body]
4034   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
4035   //     v2 = a[i, i+1, i+2, i+3];
4036   //     v3 = vector(v1(3), v2(0, 1, 2))
4037   //     b[i, i+1, i+2, i+3] = v2 - v3
4038   //     br cond, vector.body, middle.block
4039   //
4040   //   middle.block:
4041   //     x = v2(3)
4042   //     br scalar.ph
4043   //
4044   //   scalar.ph:
4045   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4046   //     br scalar.body
4047   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
4050 
4051   // Extract the last vector element in the middle block. This will be the
4052   // initial value for the recurrence when jumping to the scalar loop.
4053   VPValue *PreviousDef = PhiR->getBackedgeValue();
4054   Value *Incoming = State.get(PreviousDef, UF - 1);
4055   auto *ExtractForScalar = Incoming;
4056   auto *IdxTy = Builder.getInt32Ty();
4057   if (VF.isVector()) {
4058     auto *One = ConstantInt::get(IdxTy, 1);
4059     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4060     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4061     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4062     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
4063                                                     "vector.recur.extract");
4064   }
4065   // Extract the second last element in the middle block if the
4066   // Phi is used outside the loop. We need to extract the phi itself
4067   // and not the last element (the phi update in the current iteration). This
4068   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4069   // when the scalar loop is not run at all.
4070   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4071   if (VF.isVector()) {
4072     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4073     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4074     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4075         Incoming, Idx, "vector.recur.extract.for.phi");
4076   } else if (UF > 1)
    // When the loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
    // value of `Incoming`. This is analogous to the vectorized case above:
    // extracting the second last element when VF > 1.
4081     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4082 
4083   // Fix the initial value of the original recurrence in the scalar loop.
4084   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4085   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4086   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4087   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4088   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4089     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4090     Start->addIncoming(Incoming, BB);
4091   }
4092 
4093   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4094   Phi->setName("scalar.recur");
4095 
4096   // Finally, fix users of the recurrence outside the loop. The users will need
4097   // either the last value of the scalar recurrence or the last value of the
4098   // vector recurrence we extracted in the middle block. Since the loop is in
4099   // LCSSA form, we just need to find all the phi nodes for the original scalar
4100   // recurrence in the exit block, and then add an edge for the middle block.
4101   // Note that LCSSA does not imply single entry when the original scalar loop
4102   // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from the middle block to
  // the exit block, and thus no phis which need to be updated.
4105   if (!Cost->requiresScalarEpilogue(VF))
4106     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4107       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4108         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4109 }
4110 
4111 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4112                                        VPTransformState &State) {
4113   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
4115   assert(Legal->isReductionVariable(OrigPhi) &&
4116          "Unable to find the reduction variable");
4117   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4118 
4119   RecurKind RK = RdxDesc.getRecurrenceKind();
4120   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4121   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4122   setDebugLocFromInst(ReductionStartValue);
4123 
4124   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4125   // This is the vector-clone of the value that leaves the loop.
4126   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4127 
4128   // Wrap flags are in general invalid after vectorization, clear them.
4129   clearReductionWrapFlags(RdxDesc, State);
4130 
4131   // Before each round, move the insertion point right between
4132   // the PHIs and the values we are going to write.
4133   // This allows us to write both PHINodes and the extractelement
4134   // instructions.
4135   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4136 
4137   setDebugLocFromInst(LoopExitInst);
4138 
4139   Type *PhiTy = OrigPhi->getType();
4140   // If tail is folded by masking, the vector value to leave the loop should be
4141   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4142   // instead of the former. For an inloop reduction the reduction will already
4143   // be predicated, and does not need to be handled here.
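  //
  // For example (illustrative shorthand), with tail folding the vector loop
  // contains
  //     %rdx.next = add <VF x i32> %rdx.phi, %x
  //     %rdx.sel  = select <VF x i1> %mask, <VF x i32> %rdx.next,
  //                                         <VF x i32> %rdx.phi
  // and it is %rdx.sel, not %rdx.next, that must feed the reduction in the
  // middle block.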
4144   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4145     for (unsigned Part = 0; Part < UF; ++Part) {
4146       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4147       Value *Sel = nullptr;
4148       for (User *U : VecLoopExitInst->users()) {
4149         if (isa<SelectInst>(U)) {
4150           assert(!Sel && "Reduction exit feeding two selects");
4151           Sel = U;
4152         } else
4153           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4154       }
4155       assert(Sel && "Reduction exit feeds no select");
4156       State.reset(LoopExitInstDef, Sel, Part);
4157 
4158       // If the target can create a predicated operator for the reduction at no
4159       // extra cost in the loop (for example a predicated vadd), it can be
4160       // cheaper for the select to remain in the loop than be sunk out of it,
4161       // and so use the select value for the phi instead of the old
4162       // LoopExitValue.
4163       if (PreferPredicatedReductionSelect ||
4164           TTI->preferPredicatedReductionSelect(
4165               RdxDesc.getOpcode(), PhiTy,
4166               TargetTransformInfo::ReductionFlags())) {
4167         auto *VecRdxPhi =
4168             cast<PHINode>(State.get(PhiR, Part));
4169         VecRdxPhi->setIncomingValueForBlock(
4170             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4171       }
4172     }
4173   }
4174 
4175   // If the vector reduction can be performed in a smaller type, we truncate
4176   // then extend the loop exit value to enable InstCombine to evaluate the
4177   // entire expression in the smaller type.
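  //
  // For example (illustrative shorthand), an i32 reduction that provably
  // needs only 8 bits is rewritten in the loop latch as
  //     %t = trunc <VF x i32> %rdx to <VF x i8>
  //     %e = zext <VF x i8> %t to <VF x i32>   ; or sext, if signed
  // and the middle block then reduces the truncated <VF x i8> value.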
4178   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4179     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
4180     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4181     Builder.SetInsertPoint(
4182         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4183     VectorParts RdxParts(UF);
4184     for (unsigned Part = 0; Part < UF; ++Part) {
4185       RdxParts[Part] = State.get(LoopExitInstDef, Part);
4186       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4187       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4188                                         : Builder.CreateZExt(Trunc, VecTy);
4189       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
4190         if (U != Trunc) {
4191           U->replaceUsesOfWith(RdxParts[Part], Extnd);
4192           RdxParts[Part] = Extnd;
4193         }
4194     }
4195     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4196     for (unsigned Part = 0; Part < UF; ++Part) {
4197       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4198       State.reset(LoopExitInstDef, RdxParts[Part], Part);
4199     }
4200   }
4201 
4202   // Reduce all of the unrolled parts into a single vector.
4203   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4204   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4205 
4206   // The middle block terminator has already been assigned a DebugLoc here (the
4207   // OrigLoop's single latch terminator). We want the whole middle block to
4208   // appear to execute on this line because: (a) it is all compiler generated,
4209   // (b) these instructions are always executed after evaluating the latch
4210   // conditional branch, and (c) other passes may add new predecessors which
4211   // terminate on this line. This is the easiest way to ensure we don't
4212   // accidentally cause an extra step back into the loop while debugging.
4213   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
4214   if (PhiR->isOrdered())
4215     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4216   else {
4217     // Floating-point operations should have some FMF to enable the reduction.
4218     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4219     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4220     for (unsigned Part = 1; Part < UF; ++Part) {
4221       Value *RdxPart = State.get(LoopExitInstDef, Part);
4222       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4223         ReducedPartRdx = Builder.CreateBinOp(
4224             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4225       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
4226         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
4227                                            ReducedPartRdx, RdxPart);
4228       else
4229         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4230     }
4231   }
4232 
4233   // Create the reduction after the loop. Note that inloop reductions create the
4234   // target reduction in the loop using a Reduction recipe.
4235   if (VF.isVector() && !PhiR->isInLoop()) {
4236     ReducedPartRdx =
4237         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4238     // If the reduction can be performed in a smaller type, we need to extend
4239     // the reduction to the wider type before we branch to the original loop.
4240     if (PhiTy != RdxDesc.getRecurrenceType())
4241       ReducedPartRdx = RdxDesc.isSigned()
4242                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4243                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4244   }
4245 
4246   PHINode *ResumePhi =
4247       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4248 
4249   // Create a phi node that merges control-flow from the backedge-taken check
4250   // block and the middle block.
4251   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4252                                         LoopScalarPreHeader->getTerminator());
4253 
4254   // If we are fixing reductions in the epilogue loop then we should already
4255   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4256   // we carry over the incoming values correctly.
4257   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4258     if (Incoming == LoopMiddleBlock)
4259       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4260     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4261       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4262                               Incoming);
4263     else
4264       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4265   }
4266 
4267   // Set the resume value for this reduction
4268   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4269 
4270   // Now, we need to fix the users of the reduction variable
4271   // inside and outside of the scalar remainder loop.
4272 
4273   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4274   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4276   if (!Cost->requiresScalarEpilogue(VF))
4277     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4278       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4279         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4280 
4281   // Fix the scalar loop reduction variable with the incoming reduction sum
4282   // from the vector body and from the backedge value.
4283   int IncomingEdgeBlockIdx =
4284       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4285   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4286   // Pick the other block.
4287   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4288   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4289   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4290 }
4291 
4292 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4293                                                   VPTransformState &State) {
4294   RecurKind RK = RdxDesc.getRecurrenceKind();
4295   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4296     return;
4297 
4298   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4299   assert(LoopExitInstr && "null loop exit instruction");
4300   SmallVector<Instruction *, 8> Worklist;
4301   SmallPtrSet<Instruction *, 8> Visited;
4302   Worklist.push_back(LoopExitInstr);
4303   Visited.insert(LoopExitInstr);
4304 
4305   while (!Worklist.empty()) {
4306     Instruction *Cur = Worklist.pop_back_val();
4307     if (isa<OverflowingBinaryOperator>(Cur))
4308       for (unsigned Part = 0; Part < UF; ++Part) {
4309         // FIXME: Should not rely on getVPValue at this point.
4310         Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4311         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4312       }
4313 
4314     for (User *U : Cur->users()) {
4315       Instruction *UI = cast<Instruction>(U);
4316       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4317           Visited.insert(UI).second)
4318         Worklist.push_back(UI);
4319     }
4320   }
4321 }
4322 
4323 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4324   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4325     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4326       // Some phis were already hand updated by the reduction and recurrence
4327       // code above, leave them alone.
4328       continue;
4329 
4330     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4331     // Non-instruction incoming values will have only one value.
4332 
4333     VPLane Lane = VPLane::getFirstLane();
4334     if (isa<Instruction>(IncomingValue) &&
4335         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4336                                            VF))
4337       Lane = VPLane::getLastLaneForVF(VF);
4338 
4339     // Can be a loop invariant incoming value or the last scalar value to be
4340     // extracted from the vectorized loop.
4341     // FIXME: Should not rely on getVPValue at this point.
4342     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4343     Value *lastIncomingValue =
4344         OrigLoop->isLoopInvariant(IncomingValue)
4345             ? IncomingValue
4346             : State.get(State.Plan->getVPValue(IncomingValue, true),
4347                         VPIteration(UF - 1, Lane));
4348     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4349   }
4350 }
4351 
4352 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4353   // The basic block and loop containing the predicated instruction.
4354   auto *PredBB = PredInst->getParent();
4355   auto *VectorLoop = LI->getLoopFor(PredBB);
4356 
4357   // Initialize a worklist with the operands of the predicated instruction.
4358   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4359 
4360   // Holds instructions that we need to analyze again. An instruction may be
4361   // reanalyzed if we don't yet know if we can sink it or not.
4362   SmallVector<Instruction *, 8> InstsToReanalyze;
4363 
4364   // Returns true if a given use occurs in the predicated block. Phi nodes use
4365   // their operands in their corresponding predecessor blocks.
4366   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4367     auto *I = cast<Instruction>(U.getUser());
4368     BasicBlock *BB = I->getParent();
4369     if (auto *Phi = dyn_cast<PHINode>(I))
4370       BB = Phi->getIncomingBlock(
4371           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4372     return BB == PredBB;
4373   };
4374 
4375   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
4377   // operands are then added to the worklist. The algorithm ends after one pass
4378   // through the worklist doesn't sink a single instruction.
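  //
  // For example (illustrative shorthand), if the predicated block contains a
  // scalarized store whose address comes from a GEP used only by that store,
  // the GEP is moved into the predicated block, and the GEP's own operands
  // are then reconsidered for sinking on the next pass.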
4379   bool Changed;
4380   do {
4381     // Add the instructions that need to be reanalyzed to the worklist, and
4382     // reset the changed indicator.
4383     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4384     InstsToReanalyze.clear();
4385     Changed = false;
4386 
4387     while (!Worklist.empty()) {
4388       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4389 
4390       // We can't sink an instruction if it is a phi node, is not in the loop,
4391       // or may have side effects.
4392       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4393           I->mayHaveSideEffects())
4394         continue;
4395 
4396       // If the instruction is already in PredBB, check if we can sink its
4397       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4398       // sinking the scalar instruction I, hence it appears in PredBB; but it
4399       // may have failed to sink I's operands (recursively), which we try
4400       // (again) here.
4401       if (I->getParent() == PredBB) {
4402         Worklist.insert(I->op_begin(), I->op_end());
4403         continue;
4404       }
4405 
4406       // It's legal to sink the instruction if all its uses occur in the
4407       // predicated block. Otherwise, there's nothing to do yet, and we may
4408       // need to reanalyze the instruction.
4409       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4410         InstsToReanalyze.push_back(I);
4411         continue;
4412       }
4413 
4414       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4416       I->moveBefore(&*PredBB->getFirstInsertionPt());
4417       Worklist.insert(I->op_begin(), I->op_end());
4418 
4419       // The sinking may have enabled other instructions to be sunk, so we will
4420       // need to iterate.
4421       Changed = true;
4422     }
4423   } while (Changed);
4424 }
4425 
4426 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4427   for (PHINode *OrigPhi : OrigPHIsToFix) {
4428     VPWidenPHIRecipe *VPPhi =
4429         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4430     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4431     // Make sure the builder has a valid insert point.
4432     Builder.SetInsertPoint(NewPhi);
4433     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4434       VPValue *Inc = VPPhi->getIncomingValue(i);
4435       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4436       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4437     }
4438   }
4439 }
4440 
4441 bool InnerLoopVectorizer::useOrderedReductions(
4442     const RecurrenceDescriptor &RdxDesc) {
4443   return Cost->useOrderedReductions(RdxDesc);
4444 }
4445 
4446 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4447                                               VPWidenPHIRecipe *PhiR,
4448                                               VPTransformState &State) {
4449   PHINode *P = cast<PHINode>(PN);
4450   if (EnableVPlanNativePath) {
4451     // Currently we enter here in the VPlan-native path for non-induction
4452     // PHIs where all control flow is uniform. We simply widen these PHIs.
4453     // Create a vector phi with no operands - the vector phi operands will be
4454     // set at the end of vector code generation.
4455     Type *VecTy = (State.VF.isScalar())
4456                       ? PN->getType()
4457                       : VectorType::get(PN->getType(), State.VF);
4458     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4459     State.set(PhiR, VecPhi, 0);
4460     OrigPHIsToFix.push_back(P);
4461 
4462     return;
4463   }
4464 
4465   assert(PN->getParent() == OrigLoop->getHeader() &&
4466          "Non-header phis should have been handled elsewhere");
4467 
4468   // In order to support recurrences we need to be able to vectorize Phi nodes.
4469   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4470   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4471   // this value when we vectorize all of the instructions that use the PHI.
4472 
4473   assert(!Legal->isReductionVariable(P) &&
4474          "reductions should be handled elsewhere");
4475 
4476   setDebugLocFromInst(P);
4477 
4478   // This PHINode must be an induction variable.
4479   // Make sure that we know about it.
4480   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4481 
4482   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4483   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4484 
4485   auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV();
4486   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
4487 
4488   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4489   // which can be found from the original scalar operations.
4490   switch (II.getKind()) {
4491   case InductionDescriptor::IK_NoInduction:
4492     llvm_unreachable("Unknown induction");
4493   case InductionDescriptor::IK_IntInduction:
4494   case InductionDescriptor::IK_FpInduction:
4495     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4496   case InductionDescriptor::IK_PtrInduction: {
4497     // Handle the pointer induction variable case.
4498     assert(P->getType()->isPointerTy() && "Unexpected type.");
4499 
4500     if (Cost->isScalarAfterVectorization(P, State.VF)) {
4501       // This is the normalized GEP that starts counting at zero.
4502       Value *PtrInd =
4503           Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType());
4504       // Determine the number of scalars we need to generate for each unroll
4505       // iteration. If the instruction is uniform, we only need to generate the
4506       // first lane. Otherwise, we generate all VF values.
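      //
      // For example (illustrative, fixed VF = 4): part 1, lane 2 computes the
      // scalar index PtrInd + (4 * 1 + 2) below and feeds it through
      // emitTransformedIndex() to form the per-lane address.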
4507       bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
4508       assert((IsUniform || !State.VF.isScalable()) &&
4509              "Cannot scalarize a scalable VF");
4510       unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
4511 
4512       for (unsigned Part = 0; Part < UF; ++Part) {
4513         Value *PartStart =
4514             createStepForVF(Builder, PtrInd->getType(), VF, Part);
4515 
4516         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4517           Value *Idx = Builder.CreateAdd(
4518               PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4519           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4520           Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(),
4521                                                 DL, II, State.CFG.PrevBB);
4522           SclrGep->setName("next.gep");
4523           State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4524         }
4525       }
4526       return;
4527     }
4528     assert(isa<SCEVConstant>(II.getStep()) &&
4529            "Induction step not a SCEV constant!");
4530     Type *PhiType = II.getStep()->getType();
4531 
4532     // Build a pointer phi
4533     Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue();
4534     Type *ScStValueType = ScalarStartValue->getType();
4535     PHINode *NewPointerPhi =
4536         PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
4537     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4538 
4539     // A pointer induction, performed by using a gep
4540     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4541     Instruction *InductionLoc = LoopLatch->getTerminator();
4542     const SCEV *ScalarStep = II.getStep();
4543     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4544     Value *ScalarStepValue =
4545         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4546     Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4547     Value *NumUnrolledElems =
4548         Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4549     Value *InductionGEP = GetElementPtrInst::Create(
4550         II.getElementType(), NewPointerPhi,
4551         Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4552         InductionLoc);
4553     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4554 
4555     // Create UF many actual address geps that use the pointer
4556     // phi as base and a vectorized version of the step value
4557     // (<step*0, ..., step*N>) as offset.
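    //
    // For example (illustrative, fixed VF = 4, UF = 2, unit step):
    //   part 0: gep %pointer.phi, <4 x i64> <0, 1, 2, 3>
    //   part 1: gep %pointer.phi, <4 x i64> <4, 5, 6, 7>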
4558     for (unsigned Part = 0; Part < State.UF; ++Part) {
4559       Type *VecPhiType = VectorType::get(PhiType, State.VF);
4560       Value *StartOffsetScalar =
4561           Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4562       Value *StartOffset =
4563           Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
4564       // Create a vector of consecutive numbers from zero to VF.
4565       StartOffset =
4566           Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4567 
4568       Value *GEP = Builder.CreateGEP(
4569           II.getElementType(), NewPointerPhi,
4570           Builder.CreateMul(
4571               StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4572               "vector.gep"));
4573       State.set(PhiR, GEP, Part);
4574     }
4575   }
4576   }
4577 }
4578 
4579 /// A helper function for checking whether an integer division-related
4580 /// instruction may divide by zero (in which case it must be predicated if
4581 /// executed conditionally in the scalar code).
4582 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
4584 /// converted into multiplication, so we will still end up scalarizing
4585 /// the division, but can do so w/o predication.
4586 static bool mayDivideByZero(Instruction &I) {
4587   assert((I.getOpcode() == Instruction::UDiv ||
4588           I.getOpcode() == Instruction::SDiv ||
4589           I.getOpcode() == Instruction::URem ||
4590           I.getOpcode() == Instruction::SRem) &&
4591          "Unexpected instruction");
4592   Value *Divisor = I.getOperand(1);
4593   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4594   return !CInt || CInt->isZero();
4595 }
4596 
4597 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4598                                                VPUser &ArgOperands,
4599                                                VPTransformState &State) {
4600   assert(!isa<DbgInfoIntrinsic>(I) &&
4601          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4602   setDebugLocFromInst(&I);
4603 
4604   Module *M = I.getParent()->getParent()->getParent();
4605   auto *CI = cast<CallInst>(&I);
4606 
4607   SmallVector<Type *, 4> Tys;
4608   for (Value *ArgOperand : CI->args())
4609     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4610 
4611   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4612 
  // The flag shows whether we use an Intrinsic or a plain Call for the
  // vectorized version of the instruction.
  // Is it beneficial to perform the intrinsic call compared to the lib call?
4616   bool NeedToScalarize = false;
4617   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4618   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4619   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4620   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4621          "Instruction should be scalarized elsewhere.");
4622   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4623          "Either the intrinsic cost or vector call cost must be valid");
4624 
4625   for (unsigned Part = 0; Part < UF; ++Part) {
4626     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4627     SmallVector<Value *, 4> Args;
4628     for (auto &I : enumerate(ArgOperands.operands())) {
4629       // Some intrinsics have a scalar argument - don't replace it with a
4630       // vector.
4631       Value *Arg;
4632       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4633         Arg = State.get(I.value(), Part);
4634       else {
4635         Arg = State.get(I.value(), VPIteration(0, 0));
4636         if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
4637           TysForDecl.push_back(Arg->getType());
4638       }
4639       Args.push_back(Arg);
4640     }
4641 
4642     Function *VectorF;
4643     if (UseVectorIntrinsic) {
4644       // Use vector version of the intrinsic.
4645       if (VF.isVector())
4646         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4647       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4648       assert(VectorF && "Can't retrieve vector intrinsic.");
4649     } else {
4650       // Use vector version of the function call.
4651       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4652 #ifndef NDEBUG
4653       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4654              "Can't create vector function.");
4655 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    addMetadata(V, &I);
4667   }
4668 }
4669 
4670 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4671   // We should not collect Scalars more than once per VF. Right now, this
4672   // function is called from collectUniformsAndScalars(), which already does
4673   // this check. Collecting Scalars for VF=1 does not make any sense.
4674   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4675          "This function should not be visited twice for the same VF");
4676 
4677   SmallSetVector<Instruction *, 8> Worklist;
4678 
4679   // These sets are used to seed the analysis with pointers used by memory
4680   // accesses that will remain scalar.
4681   SmallSetVector<Instruction *, 8> ScalarPtrs;
4682   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4683   auto *Latch = TheLoop->getLoopLatch();
4684 
4685   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4686   // The pointer operands of loads and stores will be scalar as long as the
4687   // memory access is not a gather or scatter operation. The value operand of a
4688   // store will remain scalar if the store is scalarized.
4689   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4690     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4691     assert(WideningDecision != CM_Unknown &&
4692            "Widening decision should be ready at this moment");
4693     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4694       if (Ptr == Store->getValueOperand())
4695         return WideningDecision == CM_Scalarize;
4696     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4697            "Ptr is neither a value or pointer operand");
4698     return WideningDecision != CM_GatherScatter;
4699   };
4700 
4701   // A helper that returns true if the given value is a bitcast or
4702   // getelementptr instruction contained in the loop.
4703   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4704     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4705             isa<GetElementPtrInst>(V)) &&
4706            !TheLoop->isLoopInvariant(V);
4707   };
4708 
4709   // A helper that evaluates a memory access's use of a pointer. If the use will
4710   // be a scalar use and the pointer is only used by memory accesses, we place
4711   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4712   // PossibleNonScalarPtrs.
4713   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4714     // We only care about bitcast and getelementptr instructions contained in
4715     // the loop.
4716     if (!isLoopVaryingBitCastOrGEP(Ptr))
4717       return;
4718 
4719     // If the pointer has already been identified as scalar (e.g., if it was
4720     // also identified as uniform), there's nothing to do.
4721     auto *I = cast<Instruction>(Ptr);
4722     if (Worklist.count(I))
4723       return;
4724 
4725     // If the use of the pointer will be a scalar use, and all users of the
4726     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4727     // place the pointer in PossibleNonScalarPtrs.
4728     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4729           return isa<LoadInst>(U) || isa<StoreInst>(U);
4730         }))
4731       ScalarPtrs.insert(I);
4732     else
4733       PossibleNonScalarPtrs.insert(I);
4734   };
4735 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast and
  // getelementptr instructions used by memory accesses requiring a scalar
  // use.
4740   //
4741   // (1) Add to the worklist all instructions that have been identified as
4742   // uniform-after-vectorization.
4743   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4744 
4745   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4746   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4748   // scatter operation. The value operand of a store will remain scalar if the
4749   // store is scalarized.
4750   for (auto *BB : TheLoop->blocks())
4751     for (auto &I : *BB) {
4752       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4753         evaluatePtrUse(Load, Load->getPointerOperand());
4754       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4755         evaluatePtrUse(Store, Store->getPointerOperand());
4756         evaluatePtrUse(Store, Store->getValueOperand());
4757       }
4758     }
4759   for (auto *I : ScalarPtrs)
4760     if (!PossibleNonScalarPtrs.count(I)) {
4761       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4762       Worklist.insert(I);
4763     }
4764 
4765   // Insert the forced scalars.
4766   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4767   // induction variable when the PHI user is scalarized.
4768   auto ForcedScalar = ForcedScalars.find(VF);
4769   if (ForcedScalar != ForcedScalars.end())
4770     for (auto *I : ForcedScalar->second)
4771       Worklist.insert(I);
4772 
4773   // Expand the worklist by looking through any bitcasts and getelementptr
4774   // instructions we've already identified as scalar. This is similar to the
4775   // expansion step in collectLoopUniforms(); however, here we're only
4776   // expanding to include additional bitcasts and getelementptr instructions.
4777   unsigned Idx = 0;
4778   while (Idx != Worklist.size()) {
4779     Instruction *Dst = Worklist[Idx++];
4780     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4781       continue;
4782     auto *Src = cast<Instruction>(Dst->getOperand(0));
4783     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4784           auto *J = cast<Instruction>(U);
4785           return !TheLoop->contains(J) || Worklist.count(J) ||
4786                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4787                   isScalarUse(J, Src));
4788         })) {
4789       Worklist.insert(Src);
4790       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4791     }
4792   }
4793 
4794   // An induction variable will remain scalar if all users of the induction
4795   // variable and induction variable update remain scalar.
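  //
  // For example (illustrative): if the only users of the induction phi %iv
  // are a getelementptr feeding a consecutive (non-gather) load and the
  // increment %iv.next, both %iv and %iv.next remain scalar after
  // vectorization.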
4796   for (auto &Induction : Legal->getInductionVars()) {
4797     auto *Ind = Induction.first;
4798     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4799 
4800     // If tail-folding is applied, the primary induction variable will be used
4801     // to feed a vector compare.
4802     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4803       continue;
4804 
4805     // Returns true if \p Indvar is a pointer induction that is used directly by
4806     // load/store instruction \p I.
4807     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4808                                               Instruction *I) {
4809       return Induction.second.getKind() ==
4810                  InductionDescriptor::IK_PtrInduction &&
4811              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4812              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4813     };
4814 
4815     // Determine if all users of the induction variable are scalar after
4816     // vectorization.
4817     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4818       auto *I = cast<Instruction>(U);
4819       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4820              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4821     });
4822     if (!ScalarInd)
4823       continue;
4824 
4825     // Determine if all users of the induction variable update instruction are
4826     // scalar after vectorization.
4827     auto ScalarIndUpdate =
4828         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4829           auto *I = cast<Instruction>(U);
4830           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4831                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4832         });
4833     if (!ScalarIndUpdate)
4834       continue;
4835 
4836     // The induction variable and its update instruction will remain scalar.
4837     Worklist.insert(Ind);
4838     Worklist.insert(IndUpdate);
4839     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4840     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4841                       << "\n");
4842   }
4843 
4844   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4845 }
4846 
4847 bool LoopVectorizationCostModel::isScalarWithPredication(
4848     Instruction *I, ElementCount VF) const {
4849   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4850     return false;
4851   switch(I->getOpcode()) {
4852   default:
4853     break;
4854   case Instruction::Load:
4855   case Instruction::Store: {
4856     if (!Legal->isMaskRequired(I))
4857       return false;
4858     auto *Ptr = getLoadStorePointerOperand(I);
4859     auto *Ty = getLoadStoreType(I);
4860     Type *VTy = Ty;
4861     if (VF.isVector())
4862       VTy = VectorType::get(Ty, VF);
4863     const Align Alignment = getLoadStoreAlignment(I);
4864     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4865                                 TTI.isLegalMaskedGather(VTy, Alignment))
4866                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4867                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4868   }
4869   case Instruction::UDiv:
4870   case Instruction::SDiv:
4871   case Instruction::SRem:
4872   case Instruction::URem:
4873     return mayDivideByZero(*I);
4874   }
4875   return false;
4876 }
4877 
4878 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4879     Instruction *I, ElementCount VF) {
4880   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4881   assert(getWideningDecision(I, VF) == CM_Unknown &&
4882          "Decision should not be set yet.");
4883   auto *Group = getInterleavedAccessGroup(I);
4884   assert(Group && "Must have a group.");
4885 
  // If the instruction's allocated size doesn't equal its type size, it
4887   // requires padding and will be scalarized.
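  // For example, on common x86 data layouts x86_fp80 has a type size of 80
  // bits but an allocation size of 96 or 128 bits, so hasIrregularType()
  // returns true for it and the access is rejected here.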
4888   auto &DL = I->getModule()->getDataLayout();
4889   auto *ScalarTy = getLoadStoreType(I);
4890   if (hasIrregularType(ScalarTy, DL))
4891     return false;
4892 
4893   // Check if masking is required.
4894   // A Group may need masking for one of two reasons: it resides in a block that
4895   // needs predication, or it was decided to use masking to deal with gaps
4896   // (either a gap at the end of a load-access that may result in a speculative
4897   // load, or any gaps in a store-access).
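  //
  // For example (illustrative): a load-only group for accesses to a[3*i] and
  // a[3*i+1] with factor 3 has a gap at a[3*i+2]; its last wide load may read
  // past the end of the underlying object, so it needs either a scalar
  // epilogue or masking.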
4898   bool PredicatedAccessRequiresMasking =
4899       blockNeedsPredicationForAnyReason(I->getParent()) &&
4900       Legal->isMaskRequired(I);
4901   bool LoadAccessWithGapsRequiresEpilogMasking =
4902       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4903       !isScalarEpilogueAllowed();
4904   bool StoreAccessWithGapsRequiresMasking =
4905       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4906   if (!PredicatedAccessRequiresMasking &&
4907       !LoadAccessWithGapsRequiresEpilogMasking &&
4908       !StoreAccessWithGapsRequiresMasking)
4909     return true;
4910 
4911   // If masked interleaving is required, we expect that the user/target had
4912   // enabled it, because otherwise it either wouldn't have been created or
4913   // it should have been invalidated by the CostModel.
4914   assert(useMaskedInterleavedAccesses(TTI) &&
4915          "Masked interleave-groups for predicated accesses are not enabled.");
4916 
4917   if (Group->isReverse())
4918     return false;
4919 
4920   auto *Ty = getLoadStoreType(I);
4921   const Align Alignment = getLoadStoreAlignment(I);
4922   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4923                           : TTI.isLegalMaskedStore(Ty, Alignment);
4924 }
4925 
4926 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4927     Instruction *I, ElementCount VF) {
4928   // Get and ensure we have a valid memory instruction.
4929   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4930 
4931   auto *Ptr = getLoadStorePointerOperand(I);
4932   auto *ScalarTy = getLoadStoreType(I);
4933 
4934   // In order to be widened, the pointer should be consecutive, first of all.
4935   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4936     return false;
4937 
4938   // If the instruction is a store located in a predicated block, it will be
4939   // scalarized.
4940   if (isScalarWithPredication(I, VF))
4941     return false;
4942 
  // If the instruction's allocated size doesn't equal its type size, it
4944   // requires padding and will be scalarized.
4945   auto &DL = I->getModule()->getDataLayout();
4946   if (hasIrregularType(ScalarTy, DL))
4947     return false;
4948 
4949   return true;
4950 }
4951 
4952 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4953   // We should not collect Uniforms more than once per VF. Right now,
4954   // this function is called from collectUniformsAndScalars(), which
4955   // already does this check. Collecting Uniforms for VF=1 does not make any
4956   // sense.
4957 
4958   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4959          "This function should not be visited twice for the same VF");
4960 
  // Initialize the entry for this VF up front so that, even if no uniform
  // values are found, we will not analyze this VF again; Uniforms.count(VF)
  // will return 1.
4963   Uniforms[VF].clear();
4964 
4965   // We now know that the loop is vectorizable!
4966   // Collect instructions inside the loop that will remain uniform after
4967   // vectorization.
4968 
4969   // Global values, params and instructions outside of current loop are out of
4970   // scope.
4971   auto isOutOfScope = [&](Value *V) -> bool {
4972     Instruction *I = dyn_cast<Instruction>(V);
4973     return (!I || !TheLoop->contains(I));
4974   };
4975 
4976   // Worklist containing uniform instructions demanding lane 0.
4977   SetVector<Instruction *> Worklist;
4978   BasicBlock *Latch = TheLoop->getLoopLatch();
4979 
4980   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4981   // that are scalar with predication must not be considered uniform after
4982   // vectorization, because that would create an erroneous replicating region
4983   // where only a single instance out of VF should be formed.
  // TODO: Optimize such rare cases if they become important; see PR40816.
4985   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4986     if (isOutOfScope(I)) {
4987       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4988                         << *I << "\n");
4989       return;
4990     }
4991     if (isScalarWithPredication(I, VF)) {
4992       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4993                         << *I << "\n");
4994       return;
4995     }
4996     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4997     Worklist.insert(I);
4998   };
4999 
5000   // Start with the conditional branch. If the branch condition is an
5001   // instruction contained in the loop that is only used by the branch, it is
5002   // uniform.
5003   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5004   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5005     addToWorklistIfAllowed(Cmp);
5006 
5007   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5008     InstWidening WideningDecision = getWideningDecision(I, VF);
5009     assert(WideningDecision != CM_Unknown &&
5010            "Widening decision should be ready at this moment");
5011 
5012     // A uniform memory op is itself uniform.  We exclude uniform stores
5013     // here as they demand the last lane, not the first one.
5014     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5015       assert(WideningDecision == CM_Scalarize);
5016       return true;
5017     }
5018 
5019     return (WideningDecision == CM_Widen ||
5020             WideningDecision == CM_Widen_Reverse ||
5021             WideningDecision == CM_Interleave);
5022   };
5023 
5025   // Returns true if Ptr is the pointer operand of a memory access instruction
5026   // I, and I is known to not require scalarization.
5027   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5028     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5029   };
5030 
5031   // Holds a list of values which are known to have at least one uniform use.
5032   // Note that there may be other uses which aren't uniform.  A "uniform use"
5033   // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (i.e. this is not
  // the usual meaning of uniform).
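  //
  // For example (illustrative): the address operand of a consecutive
  // (unit-stride) load only demands lane 0 after vectorization, because the
  // wide load is formed from the lane-0 address, even though the address
  // itself differs per lane.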
5036   SetVector<Value *> HasUniformUse;
5037 
5038   // Scan the loop for instructions which are either a) known to have only
5039   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5040   for (auto *BB : TheLoop->blocks())
5041     for (auto &I : *BB) {
5042       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
5043         switch (II->getIntrinsicID()) {
5044         case Intrinsic::sideeffect:
5045         case Intrinsic::experimental_noalias_scope_decl:
5046         case Intrinsic::assume:
5047         case Intrinsic::lifetime_start:
5048         case Intrinsic::lifetime_end:
5049           if (TheLoop->hasLoopInvariantOperands(&I))
5050             addToWorklistIfAllowed(&I);
5051           break;
5052         default:
5053           break;
5054         }
5055       }
5056 
5057       // ExtractValue instructions must be uniform, because the operands are
5058       // known to be loop-invariant.
5059       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
5060         assert(isOutOfScope(EVI->getAggregateOperand()) &&
5061                "Expected aggregate value to be loop invariant");
5062         addToWorklistIfAllowed(EVI);
5063         continue;
5064       }
5065 
5066       // If there's no pointer operand, there's nothing to do.
5067       auto *Ptr = getLoadStorePointerOperand(&I);
5068       if (!Ptr)
5069         continue;
5070 
5071       // A uniform memory op is itself uniform.  We exclude uniform stores
5072       // here as they demand the last lane, not the first one.
5073       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5074         addToWorklistIfAllowed(&I);
5075 
5076       if (isUniformDecision(&I, VF)) {
5077         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5078         HasUniformUse.insert(Ptr);
5079       }
5080     }
5081 
5082   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5083   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5084   // disallows uses outside the loop as well.
5085   for (auto *V : HasUniformUse) {
5086     if (isOutOfScope(V))
5087       continue;
5088     auto *I = cast<Instruction>(V);
5089     auto UsersAreMemAccesses =
5090       llvm::all_of(I->users(), [&](User *U) -> bool {
5091         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5092       });
5093     if (UsersAreMemAccesses)
5094       addToWorklistIfAllowed(I);
5095   }
5096 
  // Expand Worklist in topological order: whenever a new instruction is
  // added, its users should already be inside Worklist. This ensures that a
  // uniform instruction will only be used by uniform instructions.
5100   unsigned idx = 0;
5101   while (idx != Worklist.size()) {
5102     Instruction *I = Worklist[idx++];
5103 
5104     for (auto OV : I->operand_values()) {
5105       // isOutOfScope operands cannot be uniform instructions.
5106       if (isOutOfScope(OV))
5107         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
5110       auto *OP = dyn_cast<PHINode>(OV);
5111       if (OP && Legal->isFirstOrderRecurrence(OP))
5112         continue;
5113       // If all the users of the operand are uniform, then add the
5114       // operand into the uniform worklist.
5115       auto *OI = cast<Instruction>(OV);
5116       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5117             auto *J = cast<Instruction>(U);
5118             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5119           }))
5120         addToWorklistIfAllowed(OI);
5121     }
5122   }
5123 
5124   // For an instruction to be added into Worklist above, all its users inside
5125   // the loop should also be in Worklist. However, this condition cannot be
5126   // true for phi nodes that form a cyclic dependence. We must process phi
5127   // nodes separately. An induction variable will remain uniform if all users
5128   // of the induction variable and induction variable update remain uniform.
5129   // The code below handles both pointer and non-pointer induction variables.
5130   for (auto &Induction : Legal->getInductionVars()) {
5131     auto *Ind = Induction.first;
5132     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5133 
5134     // Determine if all users of the induction variable are uniform after
5135     // vectorization.
5136     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5137       auto *I = cast<Instruction>(U);
5138       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5139              isVectorizedMemAccessUse(I, Ind);
5140     });
5141     if (!UniformInd)
5142       continue;
5143 
5144     // Determine if all users of the induction variable update instruction are
5145     // uniform after vectorization.
5146     auto UniformIndUpdate =
5147         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5148           auto *I = cast<Instruction>(U);
5149           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5150                  isVectorizedMemAccessUse(I, IndUpdate);
5151         });
5152     if (!UniformIndUpdate)
5153       continue;
5154 
5155     // The induction variable and its update instruction will remain uniform.
5156     addToWorklistIfAllowed(Ind);
5157     addToWorklistIfAllowed(IndUpdate);
5158   }
5159 
5160   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5161 }
5162 
5163 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5164   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5165 
5166   if (Legal->getRuntimePointerChecking()->Need) {
5167     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5168         "runtime pointer checks needed. Enable vectorization of this "
5169         "loop with '#pragma clang loop vectorize(enable)' when "
5170         "compiling with -Os/-Oz",
5171         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5172     return true;
5173   }
5174 
5175   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5176     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5177         "runtime SCEV checks needed. Enable vectorization of this "
5178         "loop with '#pragma clang loop vectorize(enable)' when "
5179         "compiling with -Os/-Oz",
5180         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5181     return true;
5182   }
5183 
5184   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5185   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
        "runtime stride == 1 checks needed. Enable vectorization of this "
        "loop with '#pragma clang loop vectorize(enable)' when "
        "compiling with -Os/-Oz",
        "CantVersionLoopWithOptForSize", ORE, TheLoop);
5190     return true;
5191   }
5192 
5193   return false;
5194 }
5195 
5196 ElementCount
5197 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
5198   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
5199     return ElementCount::getScalable(0);
5200 
5201   if (Hints->isScalableVectorizationDisabled()) {
5202     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
5203                             "ScalableVectorizationDisabled", ORE, TheLoop);
5204     return ElementCount::getScalable(0);
5205   }
5206 
5207   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
5208 
5209   auto MaxScalableVF = ElementCount::getScalable(
5210       std::numeric_limits<ElementCount::ScalarTy>::max());
5211 
5212   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
5213   // FIXME: While for scalable vectors this is currently sufficient, this should
5214   // be replaced by a more detailed mechanism that filters out specific VFs,
5215   // instead of invalidating vectorization for a whole set of VFs based on the
5216   // MaxVF.
5217 
5218   // Disable scalable vectorization if the loop contains unsupported reductions.
5219   if (!canVectorizeReductions(MaxScalableVF)) {
5220     reportVectorizationInfo(
5221         "Scalable vectorization not supported for the reduction "
5222         "operations found in this loop.",
5223         "ScalableVFUnfeasible", ORE, TheLoop);
5224     return ElementCount::getScalable(0);
5225   }
5226 
5227   // Disable scalable vectorization if the loop contains any instructions
5228   // with element types not supported for scalable vectors.
5229   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
5230         return !Ty->isVoidTy() &&
5231                !this->TTI.isElementTypeLegalForScalableVector(Ty);
5232       })) {
5233     reportVectorizationInfo("Scalable vectorization is not supported "
5234                             "for all element types found in this loop.",
5235                             "ScalableVFUnfeasible", ORE, TheLoop);
5236     return ElementCount::getScalable(0);
5237   }
5238 
5239   if (Legal->isSafeForAnyVectorWidth())
5240     return MaxScalableVF;
5241 
5242   // Limit MaxScalableVF by the maximum safe dependence distance.
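  // For example (illustrative values): with MaxSafeElements == 32 and a
  // maximum vscale of 4, the largest safe scalable VF is vscale x 8; if the
  // maximum vscale is unknown, scalable vectorization is deemed unfeasible
  // below.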
5243   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5244   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
5245     MaxVScale =
5246         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
5247   MaxScalableVF = ElementCount::getScalable(
5248       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5249   if (!MaxScalableVF)
5250     reportVectorizationInfo(
5251         "Max legal vector width too small, scalable vectorization "
5252         "unfeasible.",
5253         "ScalableVFUnfeasible", ORE, TheLoop);
5254 
5255   return MaxScalableVF;
5256 }
5257 
5258 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
5259     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
5260   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5261   unsigned SmallestType, WidestType;
5262   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5263 
5264   // Get the maximum safe dependence distance in bits computed by LAA.
5265   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5266   // the memory accesses that is most restrictive (involved in the smallest
5267   // dependence distance).
5268   unsigned MaxSafeElements =
5269       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
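  // For example (illustrative): if the maximum safe vector width computed by
  // LAA is 256 bits and the widest type in the loop is 32 bits, then
  // MaxSafeElements is PowerOf2Floor(256 / 32) == 8.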
5270 
5271   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
5272   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
5273 
5274   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5275                     << ".\n");
5276   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5277                     << ".\n");
5278 
5279   // First analyze the UserVF, fall back if the UserVF should be ignored.
5280   if (UserVF) {
5281     auto MaxSafeUserVF =
5282         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
5283 
5284     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
5285       // If `VF=vscale x N` is safe, then so is `VF=N`
5286       if (UserVF.isScalable())
5287         return FixedScalableVFPair(
5288             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
5289       else
5290         return UserVF;
5291     }
5292 
5293     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5294 
5295     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
5296     // is better to ignore the hint and let the compiler choose a suitable VF.
5297     if (!UserVF.isScalable()) {
5298       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5299                         << " is unsafe, clamping to max safe VF="
5300                         << MaxSafeFixedVF << ".\n");
5301       ORE->emit([&]() {
5302         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5303                                           TheLoop->getStartLoc(),
5304                                           TheLoop->getHeader())
5305                << "User-specified vectorization factor "
5306                << ore::NV("UserVectorizationFactor", UserVF)
5307                << " is unsafe, clamping to maximum safe vectorization factor "
5308                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
5309       });
5310       return MaxSafeFixedVF;
5311     }
5312 
5313     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
5314       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5315                         << " is ignored because scalable vectors are not "
5316                            "available.\n");
5317       ORE->emit([&]() {
5318         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5319                                           TheLoop->getStartLoc(),
5320                                           TheLoop->getHeader())
5321                << "User-specified vectorization factor "
5322                << ore::NV("UserVectorizationFactor", UserVF)
5323                << " is ignored because the target does not support scalable "
5324                   "vectors. The compiler will pick a more suitable value.";
5325       });
5326     } else {
5327       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5328                         << " is unsafe. Ignoring scalable UserVF.\n");
5329       ORE->emit([&]() {
5330         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5331                                           TheLoop->getStartLoc(),
5332                                           TheLoop->getHeader())
5333                << "User-specified vectorization factor "
5334                << ore::NV("UserVectorizationFactor", UserVF)
5335                << " is unsafe. Ignoring the hint to let the compiler pick a "
5336                   "more suitable value.";
5337       });
5338     }
5339   }
5340 
5341   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5342                     << " / " << WidestType << " bits.\n");
5343 
5344   FixedScalableVFPair Result(ElementCount::getFixed(1),
5345                              ElementCount::getScalable(0));
5346   if (auto MaxVF =
5347           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5348                                   MaxSafeFixedVF, FoldTailByMasking))
5349     Result.FixedVF = MaxVF;
5350 
5351   if (auto MaxVF =
5352           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5353                                   MaxSafeScalableVF, FoldTailByMasking))
5354     if (MaxVF.isScalable()) {
5355       Result.ScalableVF = MaxVF;
5356       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5357                         << "\n");
5358     }
5359 
5360   return Result;
5361 }
5362 
5363 FixedScalableVFPair
5364 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5365   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to
    // be dynamically uniform if the target can skip it.
5368     reportVectorizationFailure(
5369         "Not inserting runtime ptr check for divergent target",
5370         "runtime pointer checks needed. Not enabled for divergent target",
5371         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5372     return FixedScalableVFPair::getNone();
5373   }
5374 
5375   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5376   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5377   if (TC == 1) {
5378     reportVectorizationFailure("Single iteration (non) loop",
5379         "loop trip count is one, irrelevant for vectorization",
5380         "SingleIterationLoop", ORE, TheLoop);
5381     return FixedScalableVFPair::getNone();
5382   }
5383 
5384   switch (ScalarEpilogueStatus) {
5385   case CM_ScalarEpilogueAllowed:
5386     return computeFeasibleMaxVF(TC, UserVF, false);
5387   case CM_ScalarEpilogueNotAllowedUsePredicate:
5388     LLVM_FALLTHROUGH;
5389   case CM_ScalarEpilogueNotNeededUsePredicate:
5390     LLVM_DEBUG(
5391         dbgs() << "LV: vector predicate hint/switch found.\n"
5392                << "LV: Not allowing scalar epilogue, creating predicated "
5393                << "vector loop.\n");
5394     break;
5395   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5396     // fallthrough as a special case of OptForSize
5397   case CM_ScalarEpilogueNotAllowedOptSize:
5398     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5399       LLVM_DEBUG(
5400           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5401     else
5402       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5403                         << "count.\n");
5404 
5405     // Bail if runtime checks are required, which are not good when optimising
5406     // for size.
5407     if (runtimeChecksRequired())
5408       return FixedScalableVFPair::getNone();
5409 
5410     break;
5411   }
5412 
5413   // The only loops we can vectorize without a scalar epilogue, are loops with
5414   // a bottom-test and a single exiting block. We'd have to handle the fact
5415   // that not every instruction executes on the last iteration.  This will
5416   // require a lane mask which varies through the vector loop body.  (TODO)
5417   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5418     // If there was a tail-folding hint/switch, but we can't fold the tail by
5419     // masking, fallback to a vectorization with a scalar epilogue.
5420     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5421       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5422                            "scalar epilogue instead.\n");
5423       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5424       return computeFeasibleMaxVF(TC, UserVF, false);
5425     }
5426     return FixedScalableVFPair::getNone();
5427   }
5428 
  // Now try tail folding.
5430 
5431   // Invalidate interleave groups that require an epilogue if we can't mask
5432   // the interleave-group.
5433   if (!useMaskedInterleavedAccesses(TTI)) {
5434     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5435            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5438     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5439   }
5440 
5441   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5442   // Avoid tail folding if the trip count is known to be a multiple of any VF
5443   // we chose.
5444   // FIXME: The condition below pessimises the case for fixed-width vectors,
5445   // when scalable VFs are also candidates for vectorization.
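  // For example (illustrative): with a known trip count of 64, a fixed MaxVF
  // of 8 and UserIC == 2, MaxVFtimesIC is 16 and 64 % 16 == 0, so no tail
  // remains and tail folding is not needed.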
5446   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5447     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5448     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5449            "MaxFixedVF must be a power of 2");
5450     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5451                                    : MaxFixedVF.getFixedValue();
5452     ScalarEvolution *SE = PSE.getSE();
5453     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5454     const SCEV *ExitCount = SE->getAddExpr(
5455         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5456     const SCEV *Rem = SE->getURemExpr(
5457         SE->applyLoopGuards(ExitCount, TheLoop),
5458         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5459     if (Rem->isZero()) {
5460       // Accept MaxFixedVF if we do not have a tail.
5461       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5462       return MaxFactors;
5463     }
5464   }
5465 
  // For scalable vectors, don't use tail folding when optimizing for code
  // size or for low trip counts, unless the user has explicitly requested it.
5469   if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
5470       ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
5471       MaxFactors.ScalableVF.isVector())
5472     MaxFactors.ScalableVF = ElementCount::getScalable(0);
5473 
5474   // If we don't know the precise trip count, or if the trip count that we
5475   // found modulo the vectorization factor is not zero, try to fold the tail
5476   // by masking.
5477   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5478   if (Legal->prepareToFoldTailByMasking()) {
5479     FoldTailByMasking = true;
5480     return MaxFactors;
5481   }
5482 
5483   // If there was a tail-folding hint/switch, but we can't fold the tail by
5484   // masking, fallback to a vectorization with a scalar epilogue.
5485   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5486     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5487                          "scalar epilogue instead.\n");
5488     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5489     return MaxFactors;
5490   }
5491 
5492   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5493     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5494     return FixedScalableVFPair::getNone();
5495   }
5496 
5497   if (TC == 0) {
5498     reportVectorizationFailure(
5499         "Unable to calculate the loop count due to complex control flow",
5500         "unable to calculate the loop count due to complex control flow",
5501         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5502     return FixedScalableVFPair::getNone();
5503   }
5504 
5505   reportVectorizationFailure(
5506       "Cannot optimize for size and vectorize at the same time.",
5507       "cannot optimize for size and vectorize at the same time. "
5508       "Enable vectorization of this loop with '#pragma clang loop "
5509       "vectorize(enable)' when compiling with -Os/-Oz",
5510       "NoTailLoopWithOptForSize", ORE, TheLoop);
5511   return FixedScalableVFPair::getNone();
5512 }
5513 
5514 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5515     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5516     const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
5517   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5518   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5519       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5520                            : TargetTransformInfo::RGK_FixedWidthVector);
5521 
5522   // Convenience function to return the minimum of two ElementCounts.
5523   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5524     assert((LHS.isScalable() == RHS.isScalable()) &&
5525            "Scalable flags must match");
5526     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5527   };
5528 
5529   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
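  // For example (illustrative): with a 128-bit widest register (or a scalable
  // register of vscale x 128 bits) and a widest type of 32 bits,
  // MaxVectorElementCount is 4 (respectively vscale x 4).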
5531   auto MaxVectorElementCount = ElementCount::get(
5532       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5533       ComputeScalableMaxVF);
5534   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5535   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5536                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5537 
5538   if (!MaxVectorElementCount) {
5539     LLVM_DEBUG(dbgs() << "LV: The target has no "
5540                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5541                       << " vector registers.\n");
5542     return ElementCount::getFixed(1);
5543   }
5544 
5545   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5546   if (ConstTripCount &&
5547       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5548       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
    // If the loop trip count (TC) is known at compile time, there is no point
    // in choosing a VF greater than TC. Select the maximum power of two which
    // doesn't exceed TC.
5552     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5553     // when the TC is less than or equal to the known number of lanes.
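    // For example (illustrative): with ConstTripCount == 10 and a fixed
    // MaxVectorElementCount of 16 (and no tail folding), the MaxVF is clamped
    // to PowerOf2Floor(10) == 8.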
5554     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5555     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5556                          "exceeding the constant trip count: "
5557                       << ClampedConstTripCount << "\n");
5558     return ElementCount::getFixed(ClampedConstTripCount);
5559   }
5560 
5561   ElementCount MaxVF = MaxVectorElementCount;
5562   if (TTI.shouldMaximizeVectorBandwidth() ||
5563       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5564     auto MaxVectorElementCountMaxBW = ElementCount::get(
5565         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5566         ComputeScalableMaxVF);
5567     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5568 
5569     // Collect all viable vectorization factors larger than the default MaxVF
5570     // (i.e. MaxVectorElementCount).
5571     SmallVector<ElementCount, 8> VFs;
5572     for (ElementCount VS = MaxVectorElementCount * 2;
5573          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5574       VFs.push_back(VS);
5575 
5576     // For each VF calculate its register usage.
5577     auto RUs = calculateRegisterUsage(VFs);
5578 
5579     // Select the largest VF which doesn't require more registers than existing
5580     // ones.
5581     for (int i = RUs.size() - 1; i >= 0; --i) {
5582       bool Selected = true;
5583       for (auto &pair : RUs[i].MaxLocalUsers) {
5584         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5585         if (pair.second > TargetNumRegisters)
5586           Selected = false;
5587       }
5588       if (Selected) {
5589         MaxVF = VFs[i];
5590         break;
5591       }
5592     }
5593     if (ElementCount MinVF =
5594             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5595       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5596         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5597                           << ") with target's minimum: " << MinVF << '\n');
5598         MaxVF = MinVF;
5599       }
5600     }
5601   }
5602   return MaxVF;
5603 }
5604 
5605 bool LoopVectorizationCostModel::isMoreProfitable(
5606     const VectorizationFactor &A, const VectorizationFactor &B) const {
5607   InstructionCost CostA = A.Cost;
5608   InstructionCost CostB = B.Cost;
5609 
5610   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5611 
5612   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5613       MaxTripCount) {
5614     // If we are folding the tail and the trip count is a known (possibly small)
5615     // constant, the trip count will be rounded up to an integer number of
5616     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5617     // which we compare directly. When not folding the tail, the total cost will
5618     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5619     // approximated with the per-lane cost below instead of using the tripcount
5620     // as here.
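    // For example (illustrative): with MaxTripCount == 10, a VF=4 candidate
    // costing 20 per iteration and a VF=8 candidate costing 36 per iteration,
    // the totals are 20 * ceil(10/4) == 60 vs. 36 * ceil(10/8) == 72, so the
    // VF=4 candidate wins.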
5621     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5622     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5623     return RTCostA < RTCostB;
5624   }
5625 
5626   // Improve estimate for the vector width if it is scalable.
5627   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5628   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5629   if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
5630     if (A.Width.isScalable())
5631       EstimatedWidthA *= VScale.getValue();
5632     if (B.Width.isScalable())
5633       EstimatedWidthB *= VScale.getValue();
5634   }
5635 
5636   // Assume vscale may be larger than 1 (or the value being tuned for),
5637   // so that scalable vectorization is slightly favorable over fixed-width
5638   // vectorization.
5639   if (A.Width.isScalable() && !B.Width.isScalable())
5640     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5641 
5642   // To avoid the need for FP division:
5643   //      (CostA / A.Width) < (CostB / B.Width)
5644   // <=>  (CostA * B.Width) < (CostB * A.Width)
5645   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5646 }
5647 
5648 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5649     const ElementCountSet &VFCandidates) {
5650   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5651   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5652   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5653   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5654          "Expected Scalar VF to be a candidate");
5655 
5656   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
5657   VectorizationFactor ChosenFactor = ScalarCost;
5658 
5659   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5660   if (ForceVectorization && VFCandidates.size() > 1) {
5661     // Ignore scalar width, because the user explicitly wants vectorization.
5662     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5663     // evaluation.
5664     ChosenFactor.Cost = InstructionCost::getMax();
5665   }
5666 
5667   SmallVector<InstructionVFPair> InvalidCosts;
5668   for (const auto &i : VFCandidates) {
5669     // The cost for scalar VF=1 is already calculated, so ignore it.
5670     if (i.isScalar())
5671       continue;
5672 
5673     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5674     VectorizationFactor Candidate(i, C.first);
5675 
5676 #ifndef NDEBUG
5677     unsigned AssumedMinimumVscale = 1;
5678     if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
5679       AssumedMinimumVscale = VScale.getValue();
5680     unsigned Width =
5681         Candidate.Width.isScalable()
5682             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5683             : Candidate.Width.getFixedValue();
5684     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5685                       << " costs: " << (Candidate.Cost / Width));
5686     if (i.isScalable())
5687       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5688                         << AssumedMinimumVscale << ")");
5689     LLVM_DEBUG(dbgs() << ".\n");
5690 #endif
5691 
5692     if (!C.second && !ForceVectorization) {
5693       LLVM_DEBUG(
5694           dbgs() << "LV: Not considering vector loop of width " << i
5695                  << " because it will not generate any vector instructions.\n");
5696       continue;
5697     }
5698 
5699     // If profitable add it to ProfitableVF list.
5700     if (isMoreProfitable(Candidate, ScalarCost))
5701       ProfitableVFs.push_back(Candidate);
5702 
5703     if (isMoreProfitable(Candidate, ChosenFactor))
5704       ChosenFactor = Candidate;
5705   }
5706 
5707   // Emit a report of VFs with invalid costs in the loop.
5708   if (!InvalidCosts.empty()) {
5709     // Group the remarks per instruction, keeping the instruction order from
5710     // InvalidCosts.
5711     std::map<Instruction *, unsigned> Numbering;
5712     unsigned I = 0;
5713     for (auto &Pair : InvalidCosts)
5714       if (!Numbering.count(Pair.first))
5715         Numbering[Pair.first] = I++;
5716 
5717     // Sort the list, first on instruction(number) then on VF.
5718     llvm::sort(InvalidCosts,
5719                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5720                  if (Numbering[A.first] != Numbering[B.first])
5721                    return Numbering[A.first] < Numbering[B.first];
5722                  ElementCountComparator ECC;
5723                  return ECC(A.second, B.second);
5724                });
5725 
5726     // For a list of ordered instruction-vf pairs:
5727     //   [(load, vf1), (load, vf2), (store, vf1)]
5728     // Group the instructions together to emit separate remarks for:
5729     //   load  (vf1, vf2)
5730     //   store (vf1)
5731     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5732     auto Subset = ArrayRef<InstructionVFPair>();
5733     do {
5734       if (Subset.empty())
5735         Subset = Tail.take_front(1);
5736 
5737       Instruction *I = Subset.front().first;
5738 
5739       // If the next instruction is different, or if there are no other pairs,
5740       // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2)]
      // to emit:
      //   remark: invalid costs for 'load' at VF=(vf1, vf2)
5744       if (Subset == Tail || Tail[Subset.size()].first != I) {
5745         std::string OutString;
5746         raw_string_ostream OS(OutString);
5747         assert(!Subset.empty() && "Unexpected empty range");
5748         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5749         for (auto &Pair : Subset)
5750           OS << (Pair.second == Subset.front().second ? "" : ", ")
5751              << Pair.second;
5752         OS << "):";
5753         if (auto *CI = dyn_cast<CallInst>(I))
5754           OS << " call to " << CI->getCalledFunction()->getName();
5755         else
5756           OS << " " << I->getOpcodeName();
5757         OS.flush();
5758         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5759         Tail = Tail.drop_front(Subset.size());
5760         Subset = {};
5761       } else
5762         // Grow the subset by one element
5763         Subset = Tail.take_front(Subset.size() + 1);
5764     } while (!Tail.empty());
5765   }
5766 
5767   if (!EnableCondStoresVectorization && NumPredStores) {
5768     reportVectorizationFailure("There are conditional stores.",
5769         "store that is conditionally executed prevents vectorization",
5770         "ConditionalStore", ORE, TheLoop);
5771     ChosenFactor = ScalarCost;
5772   }
5773 
5774   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5775                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5776              << "LV: Vectorization seems to be not beneficial, "
5777              << "but was forced by a user.\n");
5778   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5779   return ChosenFactor;
5780 }
5781 
5782 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5783     const Loop &L, ElementCount VF) const {
5784   // Cross iteration phis such as reductions need special handling and are
5785   // currently unsupported.
5786   if (any_of(L.getHeader()->phis(),
5787              [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
5788     return false;
5789 
5790   // Phis with uses outside of the loop require special handling and are
5791   // currently unsupported.
5792   for (auto &Entry : Legal->getInductionVars()) {
5793     // Look for uses of the value of the induction at the last iteration.
5794     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5795     for (User *U : PostInc->users())
5796       if (!L.contains(cast<Instruction>(U)))
5797         return false;
5798     // Look for uses of penultimate value of the induction.
5799     for (User *U : Entry.first->users())
5800       if (!L.contains(cast<Instruction>(U)))
5801         return false;
5802   }
5803 
5804   // Induction variables that are widened require special handling that is
5805   // currently not supported.
5806   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5807         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5808                  this->isProfitableToScalarize(Entry.first, VF));
5809       }))
5810     return false;
5811 
  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
5815   if (L.getExitingBlock() != L.getLoopLatch())
5816     return false;
5817 
5818   return true;
5819 }
5820 
5821 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5822     const ElementCount VF) const {
5823   // FIXME: We need a much better cost-model to take different parameters such
5824   // as register pressure, code size increase and cost of extra branches into
5825   // account. For now we apply a very crude heuristic and only consider loops
5826   // with vectorization factors larger than a certain value.
5827   // We also consider epilogue vectorization unprofitable for targets that don't
5828   // consider interleaving beneficial (eg. MVE).
5829   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5830     return false;
5831   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5832     return true;
5833   return false;
5834 }
5835 
5836 VectorizationFactor
5837 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5838     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5839   VectorizationFactor Result = VectorizationFactor::Disabled();
5840   if (!EnableEpilogueVectorization) {
5841     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5842     return Result;
5843   }
5844 
5845   if (!isScalarEpilogueAllowed()) {
5846     LLVM_DEBUG(
5847         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5848                   "allowed.\n";);
5849     return Result;
5850   }
5851 
5852   // Not really a cost consideration, but check for unsupported cases here to
5853   // simplify the logic.
5854   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5855     LLVM_DEBUG(
5856         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5857                   "not a supported candidate.\n";);
5858     return Result;
5859   }
5860 
5861   if (EpilogueVectorizationForceVF > 1) {
5862     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC =
        ElementCount::getFixed(EpilogueVectorizationForceVF);
5864     if (LVP.hasPlanWithVF(ForcedEC))
5865       return {ForcedEC, 0};
5866     else {
5867       LLVM_DEBUG(
5868           dbgs()
5869               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5870       return Result;
5871     }
5872   }
5873 
5874   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5875       TheLoop->getHeader()->getParent()->hasMinSize()) {
5876     LLVM_DEBUG(
5877         dbgs()
5878             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5879     return Result;
5880   }
5881 
5882   auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5883   if (MainLoopVF.isScalable())
5884     LLVM_DEBUG(
5885         dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
5886                   "yet supported. Converting to fixed-width (VF="
5887                << FixedMainLoopVF << ") instead\n");
5888 
5889   if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) {
5890     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5891                          "this loop\n");
5892     return Result;
5893   }
5894 
5895   for (auto &NextVF : ProfitableVFs)
5896     if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) &&
5897         (Result.Width.getFixedValue() == 1 ||
5898          isMoreProfitable(NextVF, Result)) &&
5899         LVP.hasPlanWithVF(NextVF.Width))
5900       Result = NextVF;
5901 
5902   if (Result != VectorizationFactor::Disabled())
5903     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5904                       << Result.Width.getFixedValue() << "\n";);
5905   return Result;
5906 }
5907 
5908 std::pair<unsigned, unsigned>
5909 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5910   unsigned MinWidth = -1U;
5911   unsigned MaxWidth = 8;
5912   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5913   // For in-loop reductions, no element types are added to ElementTypesInLoop
5914   // if there are no loads/stores in the loop. In this case, check through the
5915   // reduction variables to determine the maximum width.
5916   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5917     // Reset MaxWidth so that we can find the smallest type used by recurrences
5918     // in the loop.
5919     MaxWidth = -1U;
5920     for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
5921       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5922       // When finding the min width used by the recurrence we need to account
5923       // for casts on the input operands of the recurrence.
5924       MaxWidth = std::min<unsigned>(
5925           MaxWidth, std::min<unsigned>(
5926                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5927                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5928     }
5929   } else {
5930     for (Type *T : ElementTypesInLoop) {
5931       MinWidth = std::min<unsigned>(
5932           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5933       MaxWidth = std::max<unsigned>(
5934           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5935     }
5936   }
5937   return {MinWidth, MaxWidth};
5938 }
5939 
5940 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5941   ElementTypesInLoop.clear();
5942   // For each block.
5943   for (BasicBlock *BB : TheLoop->blocks()) {
5944     // For each instruction in the loop.
5945     for (Instruction &I : BB->instructionsWithoutDebug()) {
5946       Type *T = I.getType();
5947 
5948       // Skip ignored values.
5949       if (ValuesToIgnore.count(&I))
5950         continue;
5951 
5952       // Only examine Loads, Stores and PHINodes.
5953       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5954         continue;
5955 
5956       // Examine PHI nodes that are reduction variables. Update the type to
5957       // account for the recurrence type.
5958       if (auto *PN = dyn_cast<PHINode>(&I)) {
5959         if (!Legal->isReductionVariable(PN))
5960           continue;
5961         const RecurrenceDescriptor &RdxDesc =
5962             Legal->getReductionVars().find(PN)->second;
5963         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5964             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5965                                       RdxDesc.getRecurrenceType(),
5966                                       TargetTransformInfo::ReductionFlags()))
5967           continue;
5968         T = RdxDesc.getRecurrenceType();
5969       }
5970 
5971       // Examine the stored values.
5972       if (auto *ST = dyn_cast<StoreInst>(&I))
5973         T = ST->getValueOperand()->getType();
5974 
5975       assert(T->isSized() &&
5976              "Expected the load/store/recurrence type to be sized");
5977 
5978       ElementTypesInLoop.insert(T);
5979     }
5980   }
5981 }
5982 
5983 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5984                                                            unsigned LoopCost) {
5985   // -- The interleave heuristics --
5986   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5987   // There are many micro-architectural considerations that we can't predict
5988   // at this level. For example, frontend pressure (on decode or fetch) due to
5989   // code size, or the number and capabilities of the execution ports.
5990   //
5991   // We use the following heuristics to select the interleave count:
5992   // 1. If the code has reductions, then we interleave to break the cross
5993   // iteration dependency.
5994   // 2. If the loop is really small, then we interleave to reduce the loop
5995   // overhead.
5996   // 3. We don't interleave if we think that we will spill registers to memory
5997   // due to the increased register pressure.
5998 
5999   if (!isScalarEpilogueAllowed())
6000     return 1;
6001 
  // The maximum safe dependence distance already constrained the chosen VF;
  // interleaving on top of that could violate the distance, so don't do it.
6003   if (Legal->getMaxSafeDepDistBytes() != -1U)
6004     return 1;
6005 
6006   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6007   const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because under those conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
6013   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6014       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6015     return 1;
6016 
6017   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these counts below, so make sure each count is at least one;
  // i.e. assume that at least one instruction uses at least one register of
  // each class.
  for (auto &pair : R.MaxLocalUsers)
    pair.second = std::max(pair.second, 1U);
6023 
6024   // We calculate the interleave count using the following formula.
6025   // Subtract the number of loop invariants from the number of available
6026   // registers. These registers are used by all of the interleaved instances.
6027   // Next, divide the remaining registers by the number of registers that is
6028   // required by the loop, in order to estimate how many parallel instances
6029   // fit without causing spills. All of this is rounded down if necessary to be
6030   // a power of two. We want power of two interleave count to simplify any
6031   // addressing operations or alignment considerations.
6032   // We also want power of two interleave counts to ensure that the induction
6033   // variable of the vector loop wraps to zero, when tail is folded by masking;
6034   // this currently happens when OptForSize, in which case IC is set to 1 above.
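  // As an illustration: with 32 registers in a class, 2 loop-invariant values
  // and a peak of 6 in-loop users of that class, the basic estimate below is
  // PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4 interleaved copies.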
6035   unsigned IC = UINT_MAX;
6036 
  for (auto &pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
6042     if (VF.isScalar()) {
6043       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6044         TargetNumRegisters = ForceTargetNumScalarRegs;
6045     } else {
6046       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6047         TargetNumRegisters = ForceTargetNumVectorRegs;
6048     }
6049     unsigned MaxLocalUsers = pair.second;
6050     unsigned LoopInvariantRegs = 0;
6051     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6052       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6053 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6055     // Don't count the induction variable as interleaved.
6056     if (EnableIndVarRegisterHeur) {
6057       TmpIC =
6058           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6059                         std::max(1U, (MaxLocalUsers - 1)));
6060     }
6061 
6062     IC = std::min(IC, TmpIC);
6063   }
6064 
6065   // Clamp the interleave ranges to reasonable counts.
6066   unsigned MaxInterleaveCount =
6067       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6068 
6069   // Check if the user has overridden the max.
6070   if (VF.isScalar()) {
6071     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6072       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6073   } else {
6074     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6075       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6076   }
6077 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count so that it does not exceed the trip count divided by
  // VF, while keeping it at least 1.
6081   //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second
  // vector iteration is enabled. However, for larger loops, there is likely to
  // be a similar benefit as for fixed-width vectors. For now, we choose to
  // leave the InterleaveCount as if vscale is '1', although if some
  // information about the vector is known (e.g. min vector size), we can make
  // a better decision.
6088   if (BestKnownTC) {
6089     MaxInterleaveCount =
6090         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6091     // Make sure MaxInterleaveCount is greater than 0.
6092     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6093   }
6094 
6095   assert(MaxInterleaveCount > 0 &&
6096          "Maximum interleave count must be greater than 0");
6097 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
6100   if (IC > MaxInterleaveCount)
6101     IC = MaxInterleaveCount;
6102   else
6103     // Make sure IC is greater than 0.
6104     IC = std::max(1u, IC);
6105 
6106   assert(IC > 0 && "Interleave count must be greater than 0.");
6107 
6108   // If we did not calculate the cost for VF (because the user selected the VF)
6109   // then we calculate the cost of VF here.
6110   if (LoopCost == 0) {
6111     InstructionCost C = expectedCost(VF).first;
6112     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
6113     LoopCost = *C.getValue();
6114   }
6115 
6116   assert(LoopCost && "Non-zero loop cost expected");
6117 
6118   // Interleave if we vectorized this loop and there is a reduction that could
6119   // benefit from interleaving.
6120   if (VF.isVector() && HasReductions) {
6121     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6122     return IC;
6123   }
6124 
6125   // Note that if we've already vectorized the loop we will have done the
6126   // runtime check and so interleaving won't require further checks.
6127   bool InterleavingRequiresRuntimePointerCheck =
6128       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6129 
6130   // We want to interleave small loops in order to reduce the loop overhead and
6131   // potentially expose ILP opportunities.
6132   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6133                     << "LV: IC is " << IC << '\n'
6134                     << "LV: VF is " << VF << '\n');
6135   const bool AggressivelyInterleaveReductions =
6136       TTI.enableAggressiveInterleaving(HasReductions);
6137   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6138     // We assume that the cost overhead is 1 and we use the cost model
6139     // to estimate the cost of the loop and interleave until the cost of the
6140     // loop overhead is about 5% of the cost of the loop.
6141     unsigned SmallIC =
6142         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
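    // For example, if SmallLoopCost is 20 and LoopCost is 3, interleaving is
    // limited to PowerOf2Floor(20 / 3) = 4 copies (and never more than IC).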
6143 
6144     // Interleave until store/load ports (estimated by max interleave count) are
6145     // saturated.
6146     unsigned NumStores = Legal->getNumStores();
6147     unsigned NumLoads = Legal->getNumLoads();
6148     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6149     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
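    // For example, IC = 8 with 2 stores and 1 load gives StoresIC = 4 and
    // LoadsIC = 8.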
6150 
6151     // There is little point in interleaving for reductions containing selects
6152     // and compares when VF=1 since it may just create more overhead than it's
6153     // worth for loops with small trip counts. This is because we still have to
6154     // do the final reduction after the loop.
6155     bool HasSelectCmpReductions =
6156         HasReductions &&
6157         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6158           const RecurrenceDescriptor &RdxDesc = Reduction.second;
6159           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
6160               RdxDesc.getRecurrenceKind());
6161         });
6162     if (HasSelectCmpReductions) {
6163       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
6164       return 1;
6165     }
6166 
6167     // If we have a scalar reduction (vector reductions are already dealt with
6168     // by this point), we can increase the critical path length if the loop
6169     // we're interleaving is inside another loop. For tree-wise reductions
6170     // set the limit to 2, and for ordered reductions it's best to disable
6171     // interleaving entirely.
6172     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6173       bool HasOrderedReductions =
6174           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6175             const RecurrenceDescriptor &RdxDesc = Reduction.second;
6176             return RdxDesc.isOrdered();
6177           });
6178       if (HasOrderedReductions) {
6179         LLVM_DEBUG(
6180             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
6181         return 1;
6182       }
6183 
6184       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6185       SmallIC = std::min(SmallIC, F);
6186       StoresIC = std::min(StoresIC, F);
6187       LoadsIC = std::min(LoadsIC, F);
6188     }
6189 
6190     if (EnableLoadStoreRuntimeInterleave &&
6191         std::max(StoresIC, LoadsIC) > SmallIC) {
6192       LLVM_DEBUG(
6193           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6194       return std::max(StoresIC, LoadsIC);
6195     }
6196 
6197     // If there are scalar reductions and TTI has enabled aggressive
6198     // interleaving for reductions, we will interleave to expose ILP.
6199     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6200         AggressivelyInterleaveReductions) {
6201       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressively as the normal
      // IC, to handle the rare case where resources are too limited.
6204       return std::max(IC / 2, SmallIC);
6205     } else {
6206       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6207       return SmallIC;
6208     }
6209   }
6210 
6211   // Interleave if this is a large loop (small loops are already dealt with by
6212   // this point) that could benefit from interleaving.
6213   if (AggressivelyInterleaveReductions) {
6214     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6215     return IC;
6216   }
6217 
6218   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6219   return 1;
6220 }
6221 
6222 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6223 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are encountered
  // before their users. We assume that each instruction that has in-loop users
  // starts an interval. We record every time that an in-loop value is used, so
  // we have a list of the first and last occurrences of each instruction.
  // Next, we transpose this data structure into a multi map that holds the
  // list of intervals that *end* at a specific location. This multi map allows
  // us to perform a linear search. We scan the instructions linearly and
  // record each time that a new interval starts, by placing it in a set.
  // If we find this value in the multi-map then we remove it from the set.
  // The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because, when we unroll, loop-invariant values do not take
  // more registers.
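  // For example (ignoring addressing and induction updates), in a body such
  // as "%a = load; %b = load; %c = add %a, %b; store %c" both loads are still
  // live when the add is reached, so together they contribute two
  // simultaneously open intervals to their register class.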
6241   LoopBlocksDFS DFS(TheLoop);
6242   DFS.perform(LI);
6243 
6244   RegisterUsage RU;
6245 
6246   // Each 'key' in the map opens a new interval. The values
6247   // of the map are the index of the 'last seen' usage of the
6248   // instruction that is the key.
6249   using IntervalMap = DenseMap<Instruction *, unsigned>;
6250 
6251   // Maps instruction to its index.
6252   SmallVector<Instruction *, 64> IdxToInstr;
6253   // Marks the end of each interval.
6254   IntervalMap EndPoint;
  // Saves the list of instructions that are used in the loop.
6256   SmallPtrSet<Instruction *, 8> Ends;
6257   // Saves the list of values that are used in the loop but are
6258   // defined outside the loop, such as arguments and constants.
6259   SmallPtrSet<Value *, 8> LoopInvariants;
6260 
6261   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6262     for (Instruction &I : BB->instructionsWithoutDebug()) {
6263       IdxToInstr.push_back(&I);
6264 
6265       // Save the end location of each USE.
6266       for (Value *U : I.operands()) {
6267         auto *Instr = dyn_cast<Instruction>(U);
6268 
6269         // Ignore non-instruction values such as arguments, constants, etc.
6270         if (!Instr)
6271           continue;
6272 
6273         // If this instruction is outside the loop then record it and continue.
6274         if (!TheLoop->contains(Instr)) {
6275           LoopInvariants.insert(Instr);
6276           continue;
6277         }
6278 
6279         // Overwrite previous end points.
6280         EndPoint[Instr] = IdxToInstr.size();
6281         Ends.insert(Instr);
6282       }
6283     }
6284   }
6285 
6286   // Saves the list of intervals that end with the index in 'key'.
6287   using InstrList = SmallVector<Instruction *, 2>;
6288   DenseMap<unsigned, InstrList> TransposeEnds;
6289 
6290   // Transpose the EndPoints to a list of values that end at each index.
6291   for (auto &Interval : EndPoint)
6292     TransposeEnds[Interval.second].push_back(Interval.first);
6293 
6294   SmallPtrSet<Instruction *, 8> OpenIntervals;
6295   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6296   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6297 
6298   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6299 
6300   // A lambda that gets the register usage for the given type and VF.
6301   const auto &TTICapture = TTI;
6302   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6303     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6304       return 0;
6305     InstructionCost::CostType RegUsage =
6306         *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6307     assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
6308            "Nonsensical values for register usage.");
6309     return RegUsage;
6310   };
6311 
6312   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6313     Instruction *I = IdxToInstr[i];
6314 
6315     // Remove all of the instructions that end at this location.
6316     InstrList &List = TransposeEnds[i];
6317     for (Instruction *ToRemove : List)
6318       OpenIntervals.erase(ToRemove);
6319 
6320     // Ignore instructions that are never used within the loop.
6321     if (!Ends.count(I))
6322       continue;
6323 
6324     // Skip ignored values.
6325     if (ValuesToIgnore.count(I))
6326       continue;
6327 
6328     // For each VF find the maximum usage of registers.
6329     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6330       // Count the number of live intervals.
6331       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6332 
      if (VFs[j].isScalar()) {
        for (auto *Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          // SmallMapVector value-initializes missing entries, so += is safe.
          RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VFs[j]);
        for (auto *Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }
6362 
      for (auto &pair : RegUsage) {
        auto &Entry = MaxUsages[j][pair.first];
        Entry = std::max(Entry, pair.second);
      }
6369     }
6370 
6371     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6372                       << OpenIntervals.size() << '\n');
6373 
6374     // Add the current instruction to the list of open intervals.
6375     OpenIntervals.insert(I);
6376   }
6377 
6378   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6379     SmallMapVector<unsigned, unsigned, 4> Invariant;
6380 
    for (auto *Inst : LoopInvariants) {
      unsigned Usage =
          VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      // SmallMapVector value-initializes missing entries, so += is safe.
      Invariant[ClassID] += Usage;
6390     }
6391 
6392     LLVM_DEBUG({
6393       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6394       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6395              << " item\n";
6396       for (const auto &pair : MaxUsages[i]) {
6397         dbgs() << "LV(REG): RegisterClass: "
6398                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6399                << " registers\n";
6400       }
6401       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6402              << " item\n";
6403       for (const auto &pair : Invariant) {
6404         dbgs() << "LV(REG): RegisterClass: "
6405                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6406                << " registers\n";
6407       }
6408     });
6409 
6410     RU.LoopInvariantRegs = Invariant;
6411     RU.MaxLocalUsers = MaxUsages[i];
6412     RUs[i] = RU;
6413   }
6414 
6415   return RUs;
6416 }
6417 
6418 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6419                                                            ElementCount VF) {
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost
  // model. Masked Load/Gather emulation was previously never allowed.
  // Only a limited amount of emulated Masked Store/Scatter was allowed.
6428   assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
6429   return isa<LoadInst>(I) ||
6430          (isa<StoreInst>(I) &&
6431           NumPredStores > NumberOfStoresToPredicate);
6432 }
6433 
6434 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6435   // If we aren't vectorizing the loop, or if we've already collected the
6436   // instructions to scalarize, there's nothing to do. Collection may already
6437   // have occurred if we have a user-selected VF and are now computing the
6438   // expected cost for interleaving.
6439   if (VF.isScalar() || VF.isZero() ||
6440       InstsToScalarize.find(VF) != InstsToScalarize.end())
6441     return;
6442 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6444   // not profitable to scalarize any instructions, the presence of VF in the
6445   // map will indicate that we've analyzed it already.
6446   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6447 
6448   // Find all the instructions that are scalar with predication in the loop and
6449   // determine if it would be better to not if-convert the blocks they are in.
6450   // If so, we also record the instructions to scalarize.
6451   for (BasicBlock *BB : TheLoop->blocks()) {
6452     if (!blockNeedsPredicationForAnyReason(BB))
6453       continue;
6454     for (Instruction &I : *BB)
6455       if (isScalarWithPredication(&I, VF)) {
6456         ScalarCostsTy ScalarCosts;
6457         // Do not apply discount if scalable, because that would lead to
6458         // invalid scalarization costs.
6459         // Do not apply discount logic if hacked cost is needed
6460         // for emulated masked memrefs.
6461         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6462             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6463           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6464         // Remember that BB will remain after vectorization.
6465         PredicatedBBsAfterVectorization.insert(BB);
6466       }
6467   }
6468 }
6469 
6470 int LoopVectorizationCostModel::computePredInstDiscount(
6471     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6472   assert(!isUniformAfterVectorization(PredInst, VF) &&
6473          "Instruction marked uniform-after-vectorization will be predicated");
6474 
6475   // Initialize the discount to zero, meaning that the scalar version and the
6476   // vector version cost the same.
6477   InstructionCost Discount = 0;
6478 
6479   // Holds instructions to analyze. The instructions we visit are mapped in
6480   // ScalarCosts. Those instructions are the ones that would be scalarized if
6481   // we find that the scalar version costs less.
6482   SmallVector<Instruction *, 8> Worklist;
6483 
6484   // Returns true if the given instruction can be scalarized.
6485   auto canBeScalarized = [&](Instruction *I) -> bool {
6486     // We only attempt to scalarize instructions forming a single-use chain
6487     // from the original predicated block that would otherwise be vectorized.
6488     // Although not strictly necessary, we give up on instructions we know will
6489     // already be scalar to avoid traversing chains that are unlikely to be
6490     // beneficial.
6491     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6492         isScalarAfterVectorization(I, VF))
6493       return false;
6494 
6495     // If the instruction is scalar with predication, it will be analyzed
6496     // separately. We ignore it within the context of PredInst.
6497     if (isScalarWithPredication(I, VF))
6498       return false;
6499 
6500     // If any of the instruction's operands are uniform after vectorization,
6501     // the instruction cannot be scalarized. This prevents, for example, a
6502     // masked load from being scalarized.
6503     //
6504     // We assume we will only emit a value for lane zero of an instruction
6505     // marked uniform after vectorization, rather than VF identical values.
6506     // Thus, if we scalarize an instruction that uses a uniform, we would
6507     // create uses of values corresponding to the lanes we aren't emitting code
6508     // for. This behavior can be changed by allowing getScalarValue to clone
6509     // the lane zero values for uniforms rather than asserting.
6510     for (Use &U : I->operands())
6511       if (auto *J = dyn_cast<Instruction>(U.get()))
6512         if (isUniformAfterVectorization(J, VF))
6513           return false;
6514 
6515     // Otherwise, we can scalarize the instruction.
6516     return true;
6517   };
6518 
6519   // Compute the expected cost discount from scalarizing the entire expression
6520   // feeding the predicated instruction. We currently only consider expressions
6521   // that are single-use instruction chains.
6522   Worklist.push_back(PredInst);
6523   while (!Worklist.empty()) {
6524     Instruction *I = Worklist.pop_back_val();
6525 
6526     // If we've already analyzed the instruction, there's nothing to do.
6527     if (ScalarCosts.find(I) != ScalarCosts.end())
6528       continue;
6529 
6530     // Compute the cost of the vector instruction. Note that this cost already
6531     // includes the scalarization overhead of the predicated instruction.
6532     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6533 
6534     // Compute the cost of the scalarized instruction. This cost is the cost of
6535     // the instruction as if it wasn't if-converted and instead remained in the
6536     // predicated block. We will scale this cost by block probability after
6537     // computing the scalarization overhead.
6538     InstructionCost ScalarCost =
6539         VF.getFixedValue() *
6540         getInstructionCost(I, ElementCount::getFixed(1)).first;
6541 
6542     // Compute the scalarization overhead of needed insertelement instructions
6543     // and phi nodes.
6544     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6545       ScalarCost += TTI.getScalarizationOverhead(
6546           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6547           APInt::getAllOnes(VF.getFixedValue()), true, false);
6548       ScalarCost +=
6549           VF.getFixedValue() *
6550           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6551     }
6552 
6553     // Compute the scalarization overhead of needed extractelement
6554     // instructions. For each of the instruction's operands, if the operand can
6555     // be scalarized, add it to the worklist; otherwise, account for the
6556     // overhead.
6557     for (Use &U : I->operands())
6558       if (auto *J = dyn_cast<Instruction>(U.get())) {
6559         assert(VectorType::isValidElementType(J->getType()) &&
6560                "Instruction has non-scalar type");
6561         if (canBeScalarized(J))
6562           Worklist.push_back(J);
6563         else if (needsExtract(J, VF)) {
6564           ScalarCost += TTI.getScalarizationOverhead(
6565               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6566               APInt::getAllOnes(VF.getFixedValue()), false, true);
6567         }
6568       }
6569 
6570     // Scale the total scalar cost by block probability.
6571     ScalarCost /= getReciprocalPredBlockProb();
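    // For example, under the assumed block execution probability of 50%, a
    // raw scalar cost of 4 is scaled down to 2.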
6572 
6573     // Compute the discount. A non-negative discount means the vector version
6574     // of the instruction costs more, and scalarizing would be beneficial.
6575     Discount += VectorCost - ScalarCost;
6576     ScalarCosts[I] = ScalarCost;
6577   }
6578 
6579   return *Discount.getValue();
6580 }
6581 
6582 LoopVectorizationCostModel::VectorizationCostTy
6583 LoopVectorizationCostModel::expectedCost(
6584     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6585   VectorizationCostTy Cost;
6586 
6587   // For each block.
6588   for (BasicBlock *BB : TheLoop->blocks()) {
6589     VectorizationCostTy BlockCost;
6590 
6591     // For each instruction in the old loop.
6592     for (Instruction &I : BB->instructionsWithoutDebug()) {
6593       // Skip ignored values.
6594       if (ValuesToIgnore.count(&I) ||
6595           (VF.isVector() && VecValuesToIgnore.count(&I)))
6596         continue;
6597 
6598       VectorizationCostTy C = getInstructionCost(&I, VF);
6599 
6600       // Check if we should override the cost.
6601       if (C.first.isValid() &&
6602           ForceTargetInstructionCost.getNumOccurrences() > 0)
6603         C.first = InstructionCost(ForceTargetInstructionCost);
6604 
6605       // Keep a list of instructions with invalid costs.
6606       if (Invalid && !C.first.isValid())
6607         Invalid->emplace_back(&I, VF);
6608 
6609       BlockCost.first += C.first;
6610       BlockCost.second |= C.second;
6611       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6612                         << " for VF " << VF << " For instruction: " << I
6613                         << '\n');
6614     }
6615 
6616     // If we are vectorizing a predicated block, it will have been
6617     // if-converted. This means that the block's instructions (aside from
6618     // stores and instructions that may divide by zero) will now be
6619     // unconditionally executed. For the scalar case, we may not always execute
6620     // the predicated block, if it is an if-else block. Thus, scale the block's
6621     // cost by the probability of executing it. blockNeedsPredication from
6622     // Legal is used so as to not include all blocks in tail folded loops.
6623     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6624       BlockCost.first /= getReciprocalPredBlockProb();
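    // For example, a predicated block with a raw cost of 10 contributes 5 to
    // the scalar loop cost under the assumed 50% execution probability.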
6625 
6626     Cost.first += BlockCost.first;
6627     Cost.second |= BlockCost.second;
6628   }
6629 
6630   return Cost;
6631 }
6632 
/// Gets the address access SCEV after verifying that the access pattern
/// is loop invariant except for the induction variable dependence.
6635 ///
6636 /// This SCEV can be sent to the Target in order to estimate the address
6637 /// calculation cost.
6638 static const SCEV *getAddressAccessSCEV(
6639               Value *Ptr,
6640               LoopVectorizationLegality *Legal,
6641               PredicatedScalarEvolution &PSE,
6642               const Loop *TheLoop) {
6643 
6644   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6645   if (!Gep)
6646     return nullptr;
6647 
6648   // We are looking for a gep with all loop invariant indices except for one
6649   // which should be an induction variable.
6650   auto SE = PSE.getSE();
6651   unsigned NumOperands = Gep->getNumOperands();
6652   for (unsigned i = 1; i < NumOperands; ++i) {
6653     Value *Opd = Gep->getOperand(i);
6654     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6655         !Legal->isInductionVariable(Opd))
6656       return nullptr;
6657   }
6658 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6660   return PSE.getSCEV(Ptr);
6661 }
6662 
6663 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6664   return Legal->hasStride(I->getOperand(0)) ||
6665          Legal->hasStride(I->getOperand(1));
6666 }
6667 
6668 InstructionCost
6669 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6670                                                         ElementCount VF) {
6671   assert(VF.isVector() &&
6672          "Scalarization cost of instruction implies vectorization.");
6673   if (VF.isScalable())
6674     return InstructionCost::getInvalid();
6675 
6676   Type *ValTy = getLoadStoreType(I);
6677   auto SE = PSE.getSE();
6678 
6679   unsigned AS = getLoadStoreAddressSpace(I);
6680   Value *Ptr = getLoadStorePointerOperand(I);
6681   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6682   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6683   //       that it is being called from this specific place.
6684 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6687   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6688 
6689   // Get the cost of the scalar memory instruction and address computation.
6690   InstructionCost Cost =
6691       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6692 
6693   // Don't pass *I here, since it is scalar but will actually be part of a
6694   // vectorized loop where the user of it is a vectorized instruction.
6695   const Align Alignment = getLoadStoreAlignment(I);
6696   Cost += VF.getKnownMinValue() *
6697           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6698                               AS, TTI::TCK_RecipThroughput);
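  // For example, at VF = 4 with unit address-computation and scalar memory-op
  // costs, the two terms above contribute 4 + 4 = 8 before the insert/extract
  // overhead added below.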
6699 
6700   // Get the overhead of the extractelement and insertelement instructions
6701   // we might create due to scalarization.
6702   Cost += getScalarizationOverhead(I, VF);
6703 
6704   // If we have a predicated load/store, it will need extra i1 extracts and
6705   // conditional branches, but may not be executed for each vector lane. Scale
6706   // the cost by the probability of executing the predicated block.
6707   if (isPredicatedInst(I, VF)) {
6708     Cost /= getReciprocalPredBlockProb();
6709 
6710     // Add the cost of an i1 extract and a branch
6711     auto *Vec_i1Ty =
6712         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6713     Cost += TTI.getScalarizationOverhead(
6714         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6715         /*Insert=*/false, /*Extract=*/true);
6716     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6717 
6718     if (useEmulatedMaskMemRefHack(I, VF))
6719       // Artificially setting to a high enough value to practically disable
6720       // vectorization with such operations.
6721       Cost = 3000000;
6722   }
6723 
6724   return Cost;
6725 }
6726 
6727 InstructionCost
6728 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6729                                                     ElementCount VF) {
6730   Type *ValTy = getLoadStoreType(I);
6731   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6732   Value *Ptr = getLoadStorePointerOperand(I);
6733   unsigned AS = getLoadStoreAddressSpace(I);
6734   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6735   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6736 
6737   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6738          "Stride should be 1 or -1 for consecutive memory access");
6739   const Align Alignment = getLoadStoreAlignment(I);
6740   InstructionCost Cost = 0;
6741   if (Legal->isMaskRequired(I))
6742     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6743                                       CostKind);
6744   else
6745     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6746                                 CostKind, I);
6747 
6748   bool Reverse = ConsecutiveStride < 0;
6749   if (Reverse)
6750     Cost +=
6751         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6752   return Cost;
6753 }
6754 
6755 InstructionCost
6756 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6757                                                 ElementCount VF) {
6758   assert(Legal->isUniformMemOp(*I));
6759 
6760   Type *ValTy = getLoadStoreType(I);
6761   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6762   const Align Alignment = getLoadStoreAlignment(I);
6763   unsigned AS = getLoadStoreAddressSpace(I);
6764   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6765   if (isa<LoadInst>(I)) {
6766     return TTI.getAddressComputationCost(ValTy) +
6767            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6768                                CostKind) +
6769            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6770   }
6771   StoreInst *SI = cast<StoreInst>(I);
6772 
6773   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6774   return TTI.getAddressComputationCost(ValTy) +
6775          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6776                              CostKind) +
6777          (isLoopInvariantStoreValue
6778               ? 0
6779               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6780                                        VF.getKnownMinValue() - 1));
6781 }
6782 
6783 InstructionCost
6784 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6785                                                  ElementCount VF) {
6786   Type *ValTy = getLoadStoreType(I);
6787   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6788   const Align Alignment = getLoadStoreAlignment(I);
6789   const Value *Ptr = getLoadStorePointerOperand(I);
6790 
6791   return TTI.getAddressComputationCost(VectorTy) +
6792          TTI.getGatherScatterOpCost(
6793              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6794              TargetTransformInfo::TCK_RecipThroughput, I);
6795 }
6796 
6797 InstructionCost
6798 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6799                                                    ElementCount VF) {
6800   // TODO: Once we have support for interleaving with scalable vectors
6801   // we can calculate the cost properly here.
6802   if (VF.isScalable())
6803     return InstructionCost::getInvalid();
6804 
6805   Type *ValTy = getLoadStoreType(I);
6806   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6807   unsigned AS = getLoadStoreAddressSpace(I);
6808 
6809   auto Group = getInterleavedAccessGroup(I);
6810   assert(Group && "Fail to get an interleaved access group.");
6811 
6812   unsigned InterleaveFactor = Group->getFactor();
6813   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6814 
6815   // Holds the indices of existing members in the interleaved group.
6816   SmallVector<unsigned, 4> Indices;
6817   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6818     if (Group->getMember(IF))
6819       Indices.push_back(IF);
6820 
6821   // Calculate the cost of the whole interleaved group.
6822   bool UseMaskForGaps =
6823       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6824       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6825   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6826       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6827       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6828 
6829   if (Group->isReverse()) {
6830     // TODO: Add support for reversed masked interleaved access.
6831     assert(!Legal->isMaskRequired(I) &&
6832            "Reverse masked interleaved access not supported.");
6833     Cost +=
6834         Group->getNumMembers() *
6835         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6836   }
6837   return Cost;
6838 }
6839 
6840 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6841     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6842   using namespace llvm::PatternMatch;
  // Early exit for loops with no in-loop reductions.
6844   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6845     return None;
6846   auto *VectorTy = cast<VectorType>(Ty);
6847 
  // We are looking for one of the following patterns, and for the minimal
  // acceptable cost of it:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we
  // find the pattern of mul/ext and test the cost of the entire pattern vs
  // the cost of the components. If the reduction cost is lower then we return
  // it for the reduction instruction and 0 for the other instructions in the
  // pattern. If it is not, we return None, indicating that the original cost
  // model should be used.
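  // As an illustrative sketch (not tied to a specific test): a loop body that
  // sign-extends two i8 vectors to i32, multiplies them and add-reduces the
  // result into a scalar accumulator matches reduce(mul(ext(A), ext(B))) and
  // may be costed as a single extended multiply-accumulate reduction (via
  // TTI::getExtendedAddReductionCost) instead of separate extend, multiply
  // and reduction steps.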
6860   Instruction *RetI = I;
6861   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6862     if (!RetI->hasOneUser())
6863       return None;
6864     RetI = RetI->user_back();
6865   }
6866   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6867       RetI->user_back()->getOpcode() == Instruction::Add) {
6868     if (!RetI->hasOneUser())
6869       return None;
6870     RetI = RetI->user_back();
6871   }
6872 
  // Test if the found instruction is a reduction, and if not return None so
  // that the parent uses the original cost modelling.
6875   if (!InLoopReductionImmediateChains.count(RetI))
6876     return None;
6877 
6878   // Find the reduction this chain is a part of and calculate the basic cost of
6879   // the reduction on its own.
6880   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6881   Instruction *ReductionPhi = LastChain;
6882   while (!isa<PHINode>(ReductionPhi))
6883     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6884 
6885   const RecurrenceDescriptor &RdxDesc =
6886       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6887 
6888   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6889       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6890 
6891   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6892   // normal fmul instruction to the cost of the fadd reduction.
6893   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6894     BaseCost +=
6895         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6896 
6897   // If we're using ordered reductions then we can just return the base cost
6898   // here, since getArithmeticReductionCost calculates the full ordered
6899   // reduction cost when FP reassociation is not allowed.
6900   if (useOrderedReductions(RdxDesc))
6901     return BaseCost;
6902 
6903   // Get the operand that was not the reduction chain and match it to one of the
6904   // patterns, returning the better cost if it is found.
6905   Instruction *RedOp = RetI->getOperand(1) == LastChain
6906                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6907                            : dyn_cast<Instruction>(RetI->getOperand(1));
6908 
6909   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6910 
6911   Instruction *Op0, *Op1;
6912   if (RedOp &&
6913       match(RedOp,
6914             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6915       match(Op0, m_ZExtOrSExt(m_Value())) &&
6916       Op0->getOpcode() == Op1->getOpcode() &&
6917       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6918       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6919       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6920 
    // Matched reduce(ext(mul(ext(A), ext(B))))
    // Note that the extend opcodes need to all match, or if A==B they will
    // have been converted to zext(mul(sext(A), sext(A))) as it is known
    // positive, which is equally fine.
6925     bool IsUnsigned = isa<ZExtInst>(Op0);
6926     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6927     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6928 
6929     InstructionCost ExtCost =
6930         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6931                              TTI::CastContextHint::None, CostKind, Op0);
6932     InstructionCost MulCost =
6933         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6934     InstructionCost Ext2Cost =
6935         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6936                              TTI::CastContextHint::None, CostKind, RedOp);
6937 
6938     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6939         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6940         CostKind);
6941 
6942     if (RedCost.isValid() &&
6943         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6944       return I == RetI ? RedCost : 0;
6945   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6946              !TheLoop->isLoopInvariant(RedOp)) {
6947     // Matched reduce(ext(A))
6948     bool IsUnsigned = isa<ZExtInst>(RedOp);
6949     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6950     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6951         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6952         CostKind);
6953 
6954     InstructionCost ExtCost =
6955         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6956                              TTI::CastContextHint::None, CostKind, RedOp);
6957     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6958       return I == RetI ? RedCost : 0;
6959   } else if (RedOp &&
6960              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6961     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6962         Op0->getOpcode() == Op1->getOpcode() &&
6963         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6964       bool IsUnsigned = isa<ZExtInst>(Op0);
6965       Type *Op0Ty = Op0->getOperand(0)->getType();
6966       Type *Op1Ty = Op1->getOperand(0)->getType();
6967       Type *LargestOpTy =
6968           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6969                                                                     : Op0Ty;
6970       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6971 
6972       // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
6973       // different sizes. We take the largest type as the ext to reduce, and add
6974       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6975       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6976           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6977           TTI::CastContextHint::None, CostKind, Op0);
6978       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6979           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6980           TTI::CastContextHint::None, CostKind, Op1);
6981       InstructionCost MulCost =
6982           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6983 
6984       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6985           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6986           CostKind);
6987       InstructionCost ExtraExtCost = 0;
6988       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6989         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6990         ExtraExtCost = TTI.getCastInstrCost(
6991             ExtraExtOp->getOpcode(), ExtType,
6992             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6993             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6994       }
6995 
6996       if (RedCost.isValid() &&
6997           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6998         return I == RetI ? RedCost : 0;
6999     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
7000       // Matched reduce(mul())
7001       InstructionCost MulCost =
7002           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7003 
7004       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7005           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
7006           CostKind);
7007 
7008       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
7009         return I == RetI ? RedCost : 0;
7010     }
7011   }
7012 
7013   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
7014 }
7015 
7016 InstructionCost
7017 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7018                                                      ElementCount VF) {
7019   // Calculate scalar cost only. Vectorization cost should be ready at this
7020   // moment.
7021   if (VF.isScalar()) {
7022     Type *ValTy = getLoadStoreType(I);
7023     const Align Alignment = getLoadStoreAlignment(I);
7024     unsigned AS = getLoadStoreAddressSpace(I);
7025 
7026     return TTI.getAddressComputationCost(ValTy) +
7027            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7028                                TTI::TCK_RecipThroughput, I);
7029   }
7030   return getWideningCost(I, VF);
7031 }
7032 
7033 LoopVectorizationCostModel::VectorizationCostTy
7034 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7035                                                ElementCount VF) {
7036   // If we know that this instruction will remain uniform, check the cost of
7037   // the scalar version.
7038   if (isUniformAfterVectorization(I, VF))
7039     VF = ElementCount::getFixed(1);
7040 
7041   if (VF.isVector() && isProfitableToScalarize(I, VF))
7042     return VectorizationCostTy(InstsToScalarize[VF][I], false);
7043 
7044   // Forced scalars do not have any scalarization overhead.
7045   auto ForcedScalar = ForcedScalars.find(VF);
7046   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
7047     auto InstSet = ForcedScalar->second;
7048     if (InstSet.count(I))
7049       return VectorizationCostTy(
7050           (getInstructionCost(I, ElementCount::getFixed(1)).first *
7051            VF.getKnownMinValue()),
7052           false);
7053   }
7054 
7055   Type *VectorTy;
7056   InstructionCost C = getInstructionCost(I, VF, VectorTy);
7057 
7058   bool TypeNotScalarized = false;
7059   if (VF.isVector() && VectorTy->isVectorTy()) {
7060     unsigned NumParts = TTI.getNumberOfParts(VectorTy);
7061     if (NumParts)
7062       TypeNotScalarized = NumParts < VF.getKnownMinValue();
7063     else
7064       C = InstructionCost::getInvalid();
7065   }
7066   return VectorizationCostTy(C, TypeNotScalarized);
7067 }
7068 
7069 InstructionCost
7070 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
7071                                                      ElementCount VF) const {
7072 
7073   // There is no mechanism yet to create a scalable scalarization loop,
7074   // so this is currently Invalid.
7075   if (VF.isScalable())
7076     return InstructionCost::getInvalid();
7077 
7078   if (VF.isScalar())
7079     return 0;
7080 
7081   InstructionCost Cost = 0;
7082   Type *RetTy = ToVectorTy(I->getType(), VF);
7083   if (!RetTy->isVoidTy() &&
7084       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
7085     Cost += TTI.getScalarizationOverhead(
7086         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
7087         false);
7088 
7089   // Some targets keep addresses scalar.
7090   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
7091     return Cost;
7092 
7093   // Some targets support efficient element stores.
7094   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
7095     return Cost;
7096 
7097   // Collect operands to consider.
7098   CallInst *CI = dyn_cast<CallInst>(I);
7099   Instruction::op_range Ops = CI ? CI->args() : I->operands();
7100 
  // Skip operands that do not require extraction/scalarization and do not
  // incur any overhead.
7103   SmallVector<Type *> Tys;
7104   for (auto *V : filterExtractingOperands(Ops, VF))
7105     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7106   return Cost + TTI.getOperandsScalarizationOverhead(
7107                     filterExtractingOperands(Ops, VF), Tys);
7108 }
7109 
7110 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
7111   if (VF.isScalar())
7112     return;
7113   NumPredStores = 0;
7114   for (BasicBlock *BB : TheLoop->blocks()) {
7115     // For each instruction in the old loop.
7116     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
7118       if (!Ptr)
7119         continue;
7120 
7121       // TODO: We should generate better code and update the cost model for
7122       // predicated uniform stores. Today they are treated as any other
7123       // predicated store (see added test cases in
7124       // invariant-store-vectorization.ll).
7125       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
7126         NumPredStores++;
7127 
7128       if (Legal->isUniformMemOp(I)) {
7129         // TODO: Avoid replicating loads and stores instead of
7130         // relying on instcombine to remove them.
7131         // Load: Scalar load + broadcast
7132         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
7133         InstructionCost Cost;
7134         if (isa<StoreInst>(&I) && VF.isScalable() &&
7135             isLegalGatherOrScatter(&I, VF)) {
7136           Cost = getGatherScatterCost(&I, VF);
7137           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
7138         } else {
7139           assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
7140                  "Cannot yet scalarize uniform stores");
7141           Cost = getUniformMemOpCost(&I, VF);
7142           setWideningDecision(&I, VF, CM_Scalarize, Cost);
7143         }
7144         continue;
7145       }
7146 
7147       // We assume that widening is the best solution when possible.
7148       if (memoryInstructionCanBeWidened(&I, VF)) {
7149         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7150         int ConsecutiveStride = Legal->isConsecutivePtr(
7151             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
7152         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7153                "Expected consecutive stride.");
7154         InstWidening Decision =
7155             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7156         setWideningDecision(&I, VF, Decision, Cost);
7157         continue;
7158       }
7159 
7160       // Choose between Interleaving, Gather/Scatter or Scalarization.
7161       InstructionCost InterleaveCost = InstructionCost::getInvalid();
7162       unsigned NumAccesses = 1;
7163       if (isAccessInterleaved(&I)) {
7164         auto Group = getInterleavedAccessGroup(&I);
7165         assert(Group && "Fail to get an interleaved access group.");
7166 
7167         // Make one decision for the whole group.
7168         if (getWideningDecision(&I, VF) != CM_Unknown)
7169           continue;
7170 
7171         NumAccesses = Group->getNumMembers();
7172         if (interleavedAccessCanBeWidened(&I, VF))
7173           InterleaveCost = getInterleaveGroupCost(&I, VF);
7174       }
7175 
7176       InstructionCost GatherScatterCost =
7177           isLegalGatherOrScatter(&I, VF)
7178               ? getGatherScatterCost(&I, VF) * NumAccesses
7179               : InstructionCost::getInvalid();
7180 
7181       InstructionCost ScalarizationCost =
7182           getMemInstScalarizationCost(&I, VF) * NumAccesses;
7183 
7184       // Choose better solution for the current VF,
7185       // write down this decision and use it during vectorization.
7186       InstructionCost Cost;
7187       InstWidening Decision;
7188       if (InterleaveCost <= GatherScatterCost &&
7189           InterleaveCost < ScalarizationCost) {
7190         Decision = CM_Interleave;
7191         Cost = InterleaveCost;
7192       } else if (GatherScatterCost < ScalarizationCost) {
7193         Decision = CM_GatherScatter;
7194         Cost = GatherScatterCost;
7195       } else {
7196         Decision = CM_Scalarize;
7197         Cost = ScalarizationCost;
7198       }
7199       // If the instruction belongs to an interleave group, the whole group
7200       // receives the same decision. The whole group receives the cost, but
7201       // the cost will actually be assigned to one instruction.
7202       if (auto Group = getInterleavedAccessGroup(&I))
7203         setWideningDecision(Group, VF, Decision, Cost);
7204       else
7205         setWideningDecision(&I, VF, Decision, Cost);
7206     }
7207   }
7208 
7209   // Make sure that any load of an address and any other address computation
7210   // remain scalar unless there is gather/scatter support. This avoids
7211   // inevitable extracts into address registers, and also has the benefit of
7212   // activating LSR more, since that pass can't optimize vectorized
7213   // addresses.
7214   if (TTI.prefersVectorizedAddressing())
7215     return;
7216 
7217   // Start with all scalar pointer uses.
7218   SmallPtrSet<Instruction *, 8> AddrDefs;
7219   for (BasicBlock *BB : TheLoop->blocks())
7220     for (Instruction &I : *BB) {
7221       Instruction *PtrDef =
7222         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7223       if (PtrDef && TheLoop->contains(PtrDef) &&
7224           getWideningDecision(&I, VF) != CM_GatherScatter)
7225         AddrDefs.insert(PtrDef);
7226     }
7227 
7228   // Add all instructions used to generate the addresses.
7229   SmallVector<Instruction *, 4> Worklist;
7230   append_range(Worklist, AddrDefs);
7231   while (!Worklist.empty()) {
7232     Instruction *I = Worklist.pop_back_val();
7233     for (auto &Op : I->operands())
7234       if (auto *InstOp = dyn_cast<Instruction>(Op))
7235         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7236             AddrDefs.insert(InstOp).second)
7237           Worklist.push_back(InstOp);
7238   }
7239 
7240   for (auto *I : AddrDefs) {
7241     if (isa<LoadInst>(I)) {
7242       // Setting the desired widening decision should ideally be handled by
7243       // cost functions, but since this involves the task of finding out
7244       // if the loaded register is involved in an address computation, it is
7245       // instead changed here when we know this is the case.
7246       InstWidening Decision = getWideningDecision(I, VF);
7247       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7248         // Scalarize a widened load of address.
7249         setWideningDecision(
7250             I, VF, CM_Scalarize,
7251             (VF.getKnownMinValue() *
7252              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7253       else if (auto Group = getInterleavedAccessGroup(I)) {
7254         // Scalarize an interleave group of address loads.
7255         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7256           if (Instruction *Member = Group->getMember(I))
7257             setWideningDecision(
7258                 Member, VF, CM_Scalarize,
7259                 (VF.getKnownMinValue() *
7260                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7261         }
7262       }
7263     } else
7264       // Make sure I gets scalarized and a cost estimate without
7265       // scalarization overhead.
7266       ForcedScalars[VF].insert(I);
7267   }
7268 }
7269 
7270 InstructionCost
7271 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7272                                                Type *&VectorTy) {
7273   Type *RetTy = I->getType();
7274   if (canTruncateToMinimalBitwidth(I, VF))
7275     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7276   auto SE = PSE.getSE();
7277   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7278 
7279   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7280                                                 ElementCount VF) -> bool {
7281     if (VF.isScalar())
7282       return true;
7283 
7284     auto Scalarized = InstsToScalarize.find(VF);
7285     assert(Scalarized != InstsToScalarize.end() &&
7286            "VF not yet analyzed for scalarization profitability");
7287     return !Scalarized->second.count(I) &&
7288            llvm::all_of(I->users(), [&](User *U) {
7289              auto *UI = cast<Instruction>(U);
7290              return !Scalarized->second.count(UI);
7291            });
7292   };
7293   (void) hasSingleCopyAfterVectorization;
7294 
7295   if (isScalarAfterVectorization(I, VF)) {
7296     // With the exception of GEPs and PHIs, after scalarization there should
7297     // only be one copy of the instruction generated in the loop. This is
7298     // because the VF is either 1, or any instructions that need scalarizing
7299     // have already been dealt with by the time we get here. As a result,
7300     // we don't have to multiply the instruction cost by VF.
7301     assert(I->getOpcode() == Instruction::GetElementPtr ||
7302            I->getOpcode() == Instruction::PHI ||
7303            (I->getOpcode() == Instruction::BitCast &&
7304             I->getType()->isPointerTy()) ||
7305            hasSingleCopyAfterVectorization(I, VF));
7306     VectorTy = RetTy;
7307   } else
7308     VectorTy = ToVectorTy(RetTy, VF);
7309 
7310   // TODO: We need to estimate the cost of intrinsic calls.
7311   switch (I->getOpcode()) {
7312   case Instruction::GetElementPtr:
7313     // We mark this instruction as zero-cost because the cost of GEPs in
7314     // vectorized code depends on whether the corresponding memory instruction
7315     // is scalarized or not. Therefore, we handle GEPs with the memory
7316     // instruction cost.
7317     return 0;
7318   case Instruction::Br: {
7319     // In cases of scalarized and predicated instructions, there will be VF
7320     // predicated blocks in the vectorized loop. Each branch around these
7321     // blocks also requires an extract of its vector compare i1 element.
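         // For example (illustrative): with VF = 4 this accounts for four
         // extracts of the i1 compare value plus four scalar branches.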
7322     bool ScalarPredicatedBB = false;
7323     BranchInst *BI = cast<BranchInst>(I);
7324     if (VF.isVector() && BI->isConditional() &&
7325         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7326          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7327       ScalarPredicatedBB = true;
7328 
7329     if (ScalarPredicatedBB) {
7330       // Not possible to scalarize a scalable vector with predicated instructions.
7331       if (VF.isScalable())
7332         return InstructionCost::getInvalid();
7333       // Return cost for branches around scalarized and predicated blocks.
7334       auto *Vec_i1Ty =
7335           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7336       return (
7337           TTI.getScalarizationOverhead(
7338               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7339           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7340     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7341       // The back-edge branch will remain, as will all scalar branches.
7342       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7343     else
7344       // This branch will be eliminated by if-conversion.
7345       return 0;
7346     // Note: We currently assume zero cost for an unconditional branch inside
7347     // a predicated block since it will become a fall-through, although we
7348     // may decide in the future to call TTI for all branches.
7349   }
7350   case Instruction::PHI: {
7351     auto *Phi = cast<PHINode>(I);
7352 
7353     // First-order recurrences are replaced by vector shuffles inside the loop.
7354     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7355     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7356       return TTI.getShuffleCost(
7357           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7358           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7359 
7360     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7361     // converted into select instructions. We require N - 1 selects per phi
7362     // node, where N is the number of incoming values.
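         // For example (illustrative): a phi merging three incoming values is
         // lowered to two vector selects.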
7363     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7364       return (Phi->getNumIncomingValues() - 1) *
7365              TTI.getCmpSelInstrCost(
7366                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7367                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7368                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7369 
7370     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7371   }
7372   case Instruction::UDiv:
7373   case Instruction::SDiv:
7374   case Instruction::URem:
7375   case Instruction::SRem:
7376     // If we have a predicated instruction, it may not be executed for each
7377     // vector lane. Get the scalarization cost and scale this amount by the
7378     // probability of executing the predicated block. If the instruction is not
7379     // predicated, we fall through to the next case.
7380     if (VF.isVector() && isScalarWithPredication(I, VF)) {
7381       InstructionCost Cost = 0;
7382 
7383       // These instructions have a non-void type, so account for the phi nodes
7384       // that we will create. This cost is likely to be zero. The phi node
7385       // cost, if any, should be scaled by the block probability because it
7386       // models a copy at the end of each predicated block.
7387       Cost += VF.getKnownMinValue() *
7388               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7389 
7390       // The cost of the non-predicated instruction.
7391       Cost += VF.getKnownMinValue() *
7392               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7393 
7394       // The cost of insertelement and extractelement instructions needed for
7395       // scalarization.
7396       Cost += getScalarizationOverhead(I, VF);
7397 
7398       // Scale the cost by the probability of executing the predicated blocks.
7399       // This assumes the predicated block for each vector lane is equally
7400       // likely.
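           // For example (illustrative): assuming the reciprocal block
           // probability is 2 (a predicated block executes once every other
           // iteration on average), a summed per-lane cost of 10 is accounted
           // as 5.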
7401       return Cost / getReciprocalPredBlockProb();
7402     }
7403     LLVM_FALLTHROUGH;
7404   case Instruction::Add:
7405   case Instruction::FAdd:
7406   case Instruction::Sub:
7407   case Instruction::FSub:
7408   case Instruction::Mul:
7409   case Instruction::FMul:
7410   case Instruction::FDiv:
7411   case Instruction::FRem:
7412   case Instruction::Shl:
7413   case Instruction::LShr:
7414   case Instruction::AShr:
7415   case Instruction::And:
7416   case Instruction::Or:
7417   case Instruction::Xor: {
7418     // Since we will replace the stride by 1, the multiplication should go away.
7419     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7420       return 0;
7421 
7422     // Detect reduction patterns
7423     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7424       return *RedCost;
7425 
7426     // Certain instructions can be cheaper to vectorize if they have a constant
7427     // second vector operand. One example of this is shifts on x86.
7428     Value *Op2 = I->getOperand(1);
7429     TargetTransformInfo::OperandValueProperties Op2VP;
7430     TargetTransformInfo::OperandValueKind Op2VK =
7431         TTI.getOperandInfo(Op2, Op2VP);
7432     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7433       Op2VK = TargetTransformInfo::OK_UniformValue;
7434 
7435     SmallVector<const Value *, 4> Operands(I->operand_values());
7436     return TTI.getArithmeticInstrCost(
7437         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7438         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7439   }
7440   case Instruction::FNeg: {
7441     return TTI.getArithmeticInstrCost(
7442         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7443         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7444         TargetTransformInfo::OP_None, I->getOperand(0), I);
7445   }
7446   case Instruction::Select: {
7447     SelectInst *SI = cast<SelectInst>(I);
7448     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7449     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7450 
7451     const Value *Op0, *Op1;
7452     using namespace llvm::PatternMatch;
7453     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7454                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7455       // select x, y, false --> x & y
7456       // select x, true, y --> x | y
7457       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7458       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7459       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7460       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7461       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7462               Op1->getType()->getScalarSizeInBits() == 1);
7463 
7464       SmallVector<const Value *, 2> Operands{Op0, Op1};
7465       return TTI.getArithmeticInstrCost(
7466           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7467           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7468     }
7469 
7470     Type *CondTy = SI->getCondition()->getType();
7471     if (!ScalarCond)
7472       CondTy = VectorType::get(CondTy, VF);
7473 
7474     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7475     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7476       Pred = Cmp->getPredicate();
7477     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7478                                   CostKind, I);
7479   }
7480   case Instruction::ICmp:
7481   case Instruction::FCmp: {
7482     Type *ValTy = I->getOperand(0)->getType();
7483     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7484     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7485       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7486     VectorTy = ToVectorTy(ValTy, VF);
7487     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7488                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7489                                   I);
7490   }
7491   case Instruction::Store:
7492   case Instruction::Load: {
7493     ElementCount Width = VF;
7494     if (Width.isVector()) {
7495       InstWidening Decision = getWideningDecision(I, Width);
7496       assert(Decision != CM_Unknown &&
7497              "CM decision should be taken at this point");
7498       if (Decision == CM_Scalarize)
7499         Width = ElementCount::getFixed(1);
7500     }
7501     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7502     return getMemoryInstructionCost(I, VF);
7503   }
7504   case Instruction::BitCast:
7505     if (I->getType()->isPointerTy())
7506       return 0;
7507     LLVM_FALLTHROUGH;
7508   case Instruction::ZExt:
7509   case Instruction::SExt:
7510   case Instruction::FPToUI:
7511   case Instruction::FPToSI:
7512   case Instruction::FPExt:
7513   case Instruction::PtrToInt:
7514   case Instruction::IntToPtr:
7515   case Instruction::SIToFP:
7516   case Instruction::UIToFP:
7517   case Instruction::Trunc:
7518   case Instruction::FPTrunc: {
7519     // Computes the CastContextHint from a Load/Store instruction.
7520     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7521       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7522              "Expected a load or a store!");
7523 
7524       if (VF.isScalar() || !TheLoop->contains(I))
7525         return TTI::CastContextHint::Normal;
7526 
7527       switch (getWideningDecision(I, VF)) {
7528       case LoopVectorizationCostModel::CM_GatherScatter:
7529         return TTI::CastContextHint::GatherScatter;
7530       case LoopVectorizationCostModel::CM_Interleave:
7531         return TTI::CastContextHint::Interleave;
7532       case LoopVectorizationCostModel::CM_Scalarize:
7533       case LoopVectorizationCostModel::CM_Widen:
7534         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7535                                         : TTI::CastContextHint::Normal;
7536       case LoopVectorizationCostModel::CM_Widen_Reverse:
7537         return TTI::CastContextHint::Reversed;
7538       case LoopVectorizationCostModel::CM_Unknown:
7539         llvm_unreachable("Instr did not go through cost modelling?");
7540       }
7541 
7542       llvm_unreachable("Unhandled case!");
7543     };
7544 
7545     unsigned Opcode = I->getOpcode();
7546     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7547     // For Trunc, the context is the only user, which must be a StoreInst.
7548     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7549       if (I->hasOneUse())
7550         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7551           CCH = ComputeCCH(Store);
7552     }
7553     // For Z/Sext, the context is the operand, which must be a LoadInst.
7554     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7555              Opcode == Instruction::FPExt) {
7556       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7557         CCH = ComputeCCH(Load);
7558     }
7559 
7560     // We optimize the truncation of induction variables having constant
7561     // integer steps. The cost of these truncations is the same as the scalar
7562     // operation.
7563     if (isOptimizableIVTruncate(I, VF)) {
7564       auto *Trunc = cast<TruncInst>(I);
7565       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7566                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7567     }
7568 
7569     // Detect reduction patterns
7570     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7571       return *RedCost;
7572 
7573     Type *SrcScalarTy = I->getOperand(0)->getType();
7574     Type *SrcVecTy =
7575         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7576     if (canTruncateToMinimalBitwidth(I, VF)) {
7577       // This cast is going to be shrunk. This may remove the cast or it might
7578       // turn it into a slightly different cast. For example, if MinBW == 16,
7579       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7580       //
7581       // Calculate the modified src and dest types.
7582       Type *MinVecTy = VectorTy;
7583       if (Opcode == Instruction::Trunc) {
7584         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7585         VectorTy =
7586             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7587       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7588         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7589         VectorTy =
7590             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7591       }
7592     }
7593 
7594     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7595   }
7596   case Instruction::Call: {
7597     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7598       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7599         return *RedCost;
7600     bool NeedToScalarize;
7601     CallInst *CI = cast<CallInst>(I);
7602     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7603     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7604       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7605       return std::min(CallCost, IntrinsicCost);
7606     }
7607     return CallCost;
7608   }
7609   case Instruction::ExtractValue:
7610     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7611   case Instruction::Alloca:
7612     // We cannot easily widen alloca to a scalable alloca, as
7613     // the result would need to be a vector of pointers.
7614     if (VF.isScalable())
7615       return InstructionCost::getInvalid();
7616     LLVM_FALLTHROUGH;
7617   default:
7618     // This opcode is unknown. Assume that it is the same as 'mul'.
7619     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7620   } // end of switch.
7621 }
7622 
7623 char LoopVectorize::ID = 0;
7624 
7625 static const char lv_name[] = "Loop Vectorization";
7626 
7627 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7628 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7629 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7630 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7631 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7632 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7633 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7634 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7635 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7636 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7637 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7638 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7639 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7640 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7641 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7642 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7643 
7644 namespace llvm {
7645 
7646 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7647 
7648 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7649                               bool VectorizeOnlyWhenForced) {
7650   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7651 }
7652 
7653 } // end namespace llvm
7654 
7655 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7656   // Check if the pointer operand of a load or store instruction is
7657   // consecutive.
7658   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7659     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7660   return false;
7661 }
7662 
7663 void LoopVectorizationCostModel::collectValuesToIgnore() {
7664   // Ignore ephemeral values.
7665   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7666 
7667   // Ignore type-promoting instructions we identified during reduction
7668   // detection.
7669   for (auto &Reduction : Legal->getReductionVars()) {
7670     const RecurrenceDescriptor &RedDes = Reduction.second;
7671     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7672     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7673   }
7674   // Ignore type-casting instructions we identified during induction
7675   // detection.
7676   for (auto &Induction : Legal->getInductionVars()) {
7677     const InductionDescriptor &IndDes = Induction.second;
7678     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7679     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7680   }
7681 }
7682 
7683 void LoopVectorizationCostModel::collectInLoopReductions() {
7684   for (auto &Reduction : Legal->getReductionVars()) {
7685     PHINode *Phi = Reduction.first;
7686     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7687 
7688     // We don't collect reductions that are type promoted (yet).
7689     if (RdxDesc.getRecurrenceType() != Phi->getType())
7690       continue;
7691 
7692     // If the target would prefer this reduction to happen "in-loop", then we
7693     // want to record it as such.
7694     unsigned Opcode = RdxDesc.getOpcode();
7695     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7696         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7697                                    TargetTransformInfo::ReductionFlags()))
7698       continue;
7699 
7700     // Check that we can correctly put the reductions into the loop, by
7701     // finding the chain of operations that leads from the phi to the loop
7702     // exit value.
7703     SmallVector<Instruction *, 4> ReductionOperations =
7704         RdxDesc.getReductionOpChain(Phi, TheLoop);
7705     bool InLoop = !ReductionOperations.empty();
7706     if (InLoop) {
7707       InLoopReductionChains[Phi] = ReductionOperations;
7708       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7709       Instruction *LastChain = Phi;
7710       for (auto *I : ReductionOperations) {
7711         InLoopReductionImmediateChains[I] = LastChain;
7712         LastChain = I;
7713       }
7714     }
7715     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7716                       << " reduction for phi: " << *Phi << "\n");
7717   }
7718 }
7719 
7720 // TODO: we could return a pair of values that specify the max VF and
7721 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7722 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7723 // doesn't have a cost model that can choose which plan to execute if
7724 // more than one is generated.
7725 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7726                                  LoopVectorizationCostModel &CM) {
7727   unsigned WidestType;
7728   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
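       // For example (illustrative): with 256-bit vector registers and a
       // widest scalar type of 32 bits, this yields a VF of 8.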
7729   return WidestVectorRegBits / WidestType;
7730 }
7731 
7732 VectorizationFactor
7733 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7734   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7735   ElementCount VF = UserVF;
7736   // Outer loop handling: They may require CFG and instruction level
7737   // transformations before even evaluating whether vectorization is profitable.
7738   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7739   // the vectorization pipeline.
7740   if (!OrigLoop->isInnermost()) {
7741     // If the user doesn't provide a vectorization factor, determine a
7742     // reasonable one.
7743     if (UserVF.isZero()) {
7744       VF = ElementCount::getFixed(determineVPlanVF(
7745           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7746               .getFixedSize(),
7747           CM));
7748       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7749 
7750       // Make sure we have a VF > 1 for stress testing.
7751       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7752         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7753                           << "overriding computed VF.\n");
7754         VF = ElementCount::getFixed(4);
7755       }
7756     }
7757     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7758     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7759            "VF needs to be a power of two");
7760     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7761                       << "VF " << VF << " to build VPlans.\n");
7762     buildVPlans(VF, VF);
7763 
7764     // For VPlan build stress testing, we bail out after VPlan construction.
7765     if (VPlanBuildStressTest)
7766       return VectorizationFactor::Disabled();
7767 
7768     return {VF, 0 /*Cost*/};
7769   }
7770 
7771   LLVM_DEBUG(
7772       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7773                 "VPlan-native path.\n");
7774   return VectorizationFactor::Disabled();
7775 }
7776 
7777 Optional<VectorizationFactor>
7778 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7779   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7780   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7781   if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7782     return None;
7783 
7784   // Invalidate interleave groups if all blocks of loop will be predicated.
7785   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7786       !useMaskedInterleavedAccesses(*TTI)) {
7787     LLVM_DEBUG(
7788         dbgs()
7789         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7790            "which requires masked-interleaved support.\n");
7791     if (CM.InterleaveInfo.invalidateGroups())
7792       // Invalidating interleave groups also requires invalidating all decisions
7793       // based on them, which includes widening decisions and uniform and scalar
7794       // values.
7795       CM.invalidateCostModelingDecisions();
7796   }
7797 
7798   ElementCount MaxUserVF =
7799       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7800   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7801   if (!UserVF.isZero() && UserVFIsLegal) {
7802     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7803            "VF needs to be a power of two");
7804     // Collect the instructions (and their associated costs) that will be more
7805     // profitable to scalarize.
7806     if (CM.selectUserVectorizationFactor(UserVF)) {
7807       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7808       CM.collectInLoopReductions();
7809       buildVPlansWithVPRecipes(UserVF, UserVF);
7810       LLVM_DEBUG(printPlans(dbgs()));
7811       return {{UserVF, 0}};
7812     } else
7813       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7814                               "InvalidCost", ORE, OrigLoop);
7815   }
7816 
7817   // Populate the set of Vectorization Factor Candidates.
7818   ElementCountSet VFCandidates;
7819   for (auto VF = ElementCount::getFixed(1);
7820        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7821     VFCandidates.insert(VF);
7822   for (auto VF = ElementCount::getScalable(1);
7823        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7824     VFCandidates.insert(VF);
7825 
7826   for (const auto &VF : VFCandidates) {
7827     // Collect Uniform and Scalar instructions after vectorization with VF.
7828     CM.collectUniformsAndScalars(VF);
7829 
7830     // Collect the instructions (and their associated costs) that will be more
7831     // profitable to scalarize.
7832     if (VF.isVector())
7833       CM.collectInstsToScalarize(VF);
7834   }
7835 
7836   CM.collectInLoopReductions();
7837   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7838   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7839 
7840   LLVM_DEBUG(printPlans(dbgs()));
7841   if (!MaxFactors.hasVector())
7842     return VectorizationFactor::Disabled();
7843 
7844   // Select the optimal vectorization factor.
7845   auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
7846 
7847   // Check if it is profitable to vectorize with runtime checks.
7848   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
7849   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
7850     bool PragmaThresholdReached =
7851         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
7852     bool ThresholdReached =
7853         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
7854     if ((ThresholdReached && !Hints.allowReordering()) ||
7855         PragmaThresholdReached) {
7856       ORE->emit([&]() {
7857         return OptimizationRemarkAnalysisAliasing(
7858                    DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
7859                    OrigLoop->getHeader())
7860                << "loop not vectorized: cannot prove it is safe to reorder "
7861                   "memory operations";
7862       });
7863       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
7864       Hints.emitRemarkWithHints();
7865       return VectorizationFactor::Disabled();
7866     }
7867   }
7868   return SelectedVF;
7869 }
7870 
7871 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7872   assert(count_if(VPlans,
7873                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7874              1 &&
7875          "Best VF has not a single VPlan.");
7876 
7877   for (const VPlanPtr &Plan : VPlans) {
7878     if (Plan->hasVF(VF))
7879       return *Plan.get();
7880   }
7881   llvm_unreachable("No plan found!");
7882 }
7883 
7884 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7885   SmallVector<Metadata *, 4> MDs;
7886   // Reserve first location for self reference to the LoopID metadata node.
7887   MDs.push_back(nullptr);
7888   bool IsUnrollMetadata = false;
7889   MDNode *LoopID = L->getLoopID();
7890   if (LoopID) {
7891     // First find existing loop unrolling disable metadata.
7892     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7893       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7894       if (MD) {
7895         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7896         IsUnrollMetadata =
7897             S && S->getString().startswith("llvm.loop.unroll.disable");
7898       }
7899       MDs.push_back(LoopID->getOperand(i));
7900     }
7901   }
7902 
7903   if (!IsUnrollMetadata) {
7904     // Add runtime unroll disable metadata.
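         // Illustrative result, assuming the loop had no prior loop metadata:
         //   !llvm.loop !0
         //   !0 = distinct !{!0, !1}
         //   !1 = !{!"llvm.loop.unroll.runtime.disable"}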
7905     LLVMContext &Context = L->getHeader()->getContext();
7906     SmallVector<Metadata *, 1> DisableOperands;
7907     DisableOperands.push_back(
7908         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7909     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7910     MDs.push_back(DisableNode);
7911     MDNode *NewLoopID = MDNode::get(Context, MDs);
7912     // Set operand 0 to refer to the loop id itself.
7913     NewLoopID->replaceOperandWith(0, NewLoopID);
7914     L->setLoopID(NewLoopID);
7915   }
7916 }
7917 
7918 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7919                                            VPlan &BestVPlan,
7920                                            InnerLoopVectorizer &ILV,
7921                                            DominatorTree *DT) {
7922   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7923                     << ", UF=" << BestUF << '\n');
7924 
7925   // Perform the actual loop transformation.
7926 
7927   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7928   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7929   Value *CanonicalIVStartValue;
7930   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7931       ILV.createVectorizedLoopSkeleton();
7932   ILV.collectPoisonGeneratingRecipes(State);
7933 
7934   ILV.printDebugTracesAtStart();
7935 
7936   //===------------------------------------------------===//
7937   //
7938   // Notice: any optimization or new instruction that go
7939   // into the code below should also be implemented in
7940   // the cost-model.
7941   //
7942   //===------------------------------------------------===//
7943 
7944   // 2. Copy and widen instructions from the old loop into the new loop.
7945   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7946                              ILV.getOrCreateVectorTripCount(nullptr),
7947                              CanonicalIVStartValue, State);
7948   BestVPlan.execute(&State);
7949 
7950   // Keep all loop hints from the original loop on the vector loop (we'll
7951   // replace the vectorizer-specific hints below).
7952   MDNode *OrigLoopID = OrigLoop->getLoopID();
7953 
7954   Optional<MDNode *> VectorizedLoopID =
7955       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7956                                       LLVMLoopVectorizeFollowupVectorized});
7957 
7958   Loop *L = LI->getLoopFor(State.CFG.PrevBB);
7959   if (VectorizedLoopID.hasValue())
7960     L->setLoopID(VectorizedLoopID.getValue());
7961   else {
7962     // Keep all loop hints from the original loop on the vector loop (we'll
7963     // replace the vectorizer-specific hints below).
7964     if (MDNode *LID = OrigLoop->getLoopID())
7965       L->setLoopID(LID);
7966 
7967     LoopVectorizeHints Hints(L, true, *ORE);
7968     Hints.setAlreadyVectorized();
7969   }
7970   // Disable runtime unrolling when vectorizing the epilogue loop.
7971   if (CanonicalIVStartValue)
7972     AddRuntimeUnrollDisableMetaData(L);
7973 
7974   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7975   //    predication, updating analyses.
7976   ILV.fixVectorizedLoop(State);
7977 
7978   ILV.printDebugTracesAtEnd();
7979 }
7980 
7981 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7982 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7983   for (const auto &Plan : VPlans)
7984     if (PrintVPlansInDotFormat)
7985       Plan->printDOT(O);
7986     else
7987       Plan->print(O);
7988 }
7989 #endif
7990 
7991 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7992     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7993 
7994   // We create new control-flow for the vectorized loop, so the original exit
7995   // conditions will be dead after vectorization if they are only used by the
7996   // terminator.
7997   SmallVector<BasicBlock*> ExitingBlocks;
7998   OrigLoop->getExitingBlocks(ExitingBlocks);
7999   for (auto *BB : ExitingBlocks) {
8000     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
8001     if (!Cmp || !Cmp->hasOneUse())
8002       continue;
8003 
8004     // TODO: we should introduce a getUniqueExitingBlocks on Loop
8005     if (!DeadInstructions.insert(Cmp).second)
8006       continue;
8007 
8008     // An operand of the icmp is often a dead trunc, used by IndUpdate.
8009     // TODO: can recurse through operands in general
8010     for (Value *Op : Cmp->operands()) {
8011       if (isa<TruncInst>(Op) && Op->hasOneUse())
8012           DeadInstructions.insert(cast<Instruction>(Op));
8013     }
8014   }
8015 
8016   // We create new "steps" for induction variable updates to which the original
8017   // induction variables map. An original update instruction will be dead if
8018   // all its users except the induction variable are dead.
8019   auto *Latch = OrigLoop->getLoopLatch();
8020   for (auto &Induction : Legal->getInductionVars()) {
8021     PHINode *Ind = Induction.first;
8022     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8023 
8024     // If the tail is to be folded by masking, the primary induction variable,
8025     // if it exists, isn't dead: it will be used for masking. Don't kill it.
8026     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
8027       continue;
8028 
8029     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
8030           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
8031         }))
8032       DeadInstructions.insert(IndUpdate);
8033   }
8034 }
8035 
8036 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
8037 
8038 //===--------------------------------------------------------------------===//
8039 // EpilogueVectorizerMainLoop
8040 //===--------------------------------------------------------------------===//
8041 
8042 /// This function is partially responsible for generating the control flow
8043 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8044 std::pair<BasicBlock *, Value *>
8045 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
8046   MDNode *OrigLoopID = OrigLoop->getLoopID();
8047   Loop *Lp = createVectorLoopSkeleton("");
8048 
8049   // Generate the code to check the minimum iteration count of the vector
8050   // epilogue (see below).
8051   EPI.EpilogueIterationCountCheck =
8052       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
8053   EPI.EpilogueIterationCountCheck->setName("iter.check");
8054 
8055   // Generate the code to check any assumptions that we've made for SCEV
8056   // expressions.
8057   EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
8058 
8059   // Generate the code that checks at runtime if arrays overlap. We put the
8060   // checks into a separate block to make the more common case of few elements
8061   // faster.
8062   EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
8063 
8064   // Generate the iteration count check for the main loop, *after* the check
8065   // for the epilogue loop, so that the path-length is shorter for the case
8066   // that goes directly through the vector epilogue. The longer-path length for
8067   // the main loop is compensated for, by the gain from vectorizing the larger
8068   // trip count. Note: the branch will get updated later on when we vectorize
8069   // the epilogue.
8070   EPI.MainLoopIterationCountCheck =
8071       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
8072 
8073   // Generate the induction variable.
8074   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8075   EPI.VectorTripCount = CountRoundDown;
8076   createHeaderBranch(Lp);
8077 
8078   // Skip induction resume value creation here because the resume values will
8079   // be created in the second pass. If we created them here, they wouldn't be
8080   // used anyway, because the VPlan in the second pass still contains the
8081   // inductions from the original loop.
8082 
8083   return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
8084 }
8085 
8086 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
8087   LLVM_DEBUG({
8088     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
8089            << "Main Loop VF:" << EPI.MainLoopVF
8090            << ", Main Loop UF:" << EPI.MainLoopUF
8091            << ", Epilogue Loop VF:" << EPI.EpilogueVF
8092            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8093   });
8094 }
8095 
8096 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
8097   DEBUG_WITH_TYPE(VerboseDebug, {
8098     dbgs() << "intermediate fn:\n"
8099            << *OrigLoop->getHeader()->getParent() << "\n";
8100   });
8101 }
8102 
8103 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8104     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
8105   assert(L && "Expected valid Loop.");
8106   assert(Bypass && "Expected valid bypass basic block.");
8107   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
8108   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8109   Value *Count = getOrCreateTripCount(L);
8110   // Reuse existing vector loop preheader for TC checks.
8111   // Note that new preheader block is generated for vector loop.
8112   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8113   IRBuilder<> Builder(TCCheckBlock->getTerminator());
8114 
8115   // Generate code to check if the loop's trip count is less than VF * UF of the
8116   // main vector loop.
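       // For example (illustrative): with a fixed main-loop VF of 8 and UF of
       // 2 the step is 16, so smaller trip counts take the bypass to the
       // scalar loop (16 * vscale for a scalable VF).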
8117   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
8118       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8119 
8120   Value *CheckMinIters = Builder.CreateICmp(
8121       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
8122       "min.iters.check");
8123 
8124   if (!ForEpilogue)
8125     TCCheckBlock->setName("vector.main.loop.iter.check");
8126 
8127   // Create new preheader for vector loop.
8128   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8129                                    DT, LI, nullptr, "vector.ph");
8130 
8131   if (ForEpilogue) {
8132     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8133                                  DT->getNode(Bypass)->getIDom()) &&
8134            "TC check is expected to dominate Bypass");
8135 
8136     // Update dominator for Bypass & LoopExit.
8137     DT->changeImmediateDominator(Bypass, TCCheckBlock);
8138     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8139       // For loops with multiple exits, there's no edge from the middle block
8140       // to exit blocks (as the epilogue must run) and thus no need to update
8141       // the immediate dominator of the exit blocks.
8142       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8143 
8144     LoopBypassBlocks.push_back(TCCheckBlock);
8145 
8146     // Save the trip count so we don't have to regenerate it in the
8147     // vec.epilog.iter.check. This is safe to do because the trip count
8148     // generated here dominates the vector epilog iter check.
8149     EPI.TripCount = Count;
8150   }
8151 
8152   ReplaceInstWithInst(
8153       TCCheckBlock->getTerminator(),
8154       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8155 
8156   return TCCheckBlock;
8157 }
8158 
8159 //===--------------------------------------------------------------------===//
8160 // EpilogueVectorizerEpilogueLoop
8161 //===--------------------------------------------------------------------===//
8162 
8163 /// This function is partially responsible for generating the control flow
8164 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8165 std::pair<BasicBlock *, Value *>
8166 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8167   MDNode *OrigLoopID = OrigLoop->getLoopID();
8168   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
8169 
8170   // Now, compare the remaining count; if there aren't enough iterations to
8171   // execute the vectorized epilogue, skip to the scalar part.
8172   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
8173   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
8174   LoopVectorPreHeader =
8175       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
8176                  LI, nullptr, "vec.epilog.ph");
8177   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
8178                                           VecEpilogueIterationCountCheck);
8179 
8180   // Adjust the control flow taking the state info from the main loop
8181   // vectorization into account.
8182   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8183          "expected this to be saved from the previous pass.");
8184   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8185       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8186 
8187   DT->changeImmediateDominator(LoopVectorPreHeader,
8188                                EPI.MainLoopIterationCountCheck);
8189 
8190   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8191       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8192 
8193   if (EPI.SCEVSafetyCheck)
8194     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8195         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8196   if (EPI.MemSafetyCheck)
8197     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8198         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8199 
8200   DT->changeImmediateDominator(
8201       VecEpilogueIterationCountCheck,
8202       VecEpilogueIterationCountCheck->getSinglePredecessor());
8203 
8204   DT->changeImmediateDominator(LoopScalarPreHeader,
8205                                EPI.EpilogueIterationCountCheck);
8206   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8207     // If there is an epilogue which must run, there's no edge from the
8208     // middle block to exit blocks and thus no need to update the immediate
8209     // dominator of the exit blocks.
8210     DT->changeImmediateDominator(LoopExitBlock,
8211                                  EPI.EpilogueIterationCountCheck);
8212 
8213   // Keep track of bypass blocks, as they feed start values to the induction
8214   // phis in the scalar loop preheader.
8215   if (EPI.SCEVSafetyCheck)
8216     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8217   if (EPI.MemSafetyCheck)
8218     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8219   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8220 
8221   // The vec.epilog.iter.check block may contain Phi nodes from reductions which
8222   // merge control-flow from the latch block and the middle block. Update the
8223   // incoming values here and move the Phi into the preheader.
8224   SmallVector<PHINode *, 4> PhisInBlock;
8225   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
8226     PhisInBlock.push_back(&Phi);
8227 
8228   for (PHINode *Phi : PhisInBlock) {
8229     Phi->replaceIncomingBlockWith(
8230         VecEpilogueIterationCountCheck->getSinglePredecessor(),
8231         VecEpilogueIterationCountCheck);
8232     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
8233     if (EPI.SCEVSafetyCheck)
8234       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
8235     if (EPI.MemSafetyCheck)
8236       Phi->removeIncomingValue(EPI.MemSafetyCheck);
8237     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
8238   }
8239 
8240   // Generate a resume induction for the vector epilogue and put it in the
8241   // vector epilogue preheader.
8242   Type *IdxTy = Legal->getWidestInductionType();
8243   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8244                                          LoopVectorPreHeader->getFirstNonPHI());
8245   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8246   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8247                            EPI.MainLoopIterationCountCheck);
8248 
8249   // Generate the induction variable.
8250   createHeaderBranch(Lp);
8251 
8252   // Generate induction resume values. These variables save the new starting
8253   // indexes for the scalar loop. They are used to test if there are any tail
8254   // iterations left once the vector loop has completed.
8255   // Note that when the vectorized epilogue is skipped due to iteration count
8256   // check, then the resume value for the induction variable comes from
8257   // the trip count of the main vector loop, hence passing the AdditionalBypass
8258   // argument.
8259   createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
8260                                    EPI.VectorTripCount} /* AdditionalBypass */);
8261 
8262   return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
8263 }
8264 
8265 BasicBlock *
8266 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8267     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8268 
8269   assert(EPI.TripCount &&
8270          "Expected trip count to have been safed in the first pass.");
8271   assert(
8272       (!isa<Instruction>(EPI.TripCount) ||
8273        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8274       "saved trip count does not dominate insertion point.");
8275   Value *TC = EPI.TripCount;
8276   IRBuilder<> Builder(Insert->getTerminator());
8277   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8278 
8279   // Generate code to check if the loop's trip count is less than VF * UF of the
8280   // vector epilogue loop.
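       // For example (illustrative): with an epilogue VF of 4 and UF of 1, the
       // scalar loop is taken when fewer than 4 iterations remain.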
8281   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8282       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8283 
8284   Value *CheckMinIters =
8285       Builder.CreateICmp(P, Count,
8286                          createStepForVF(Builder, Count->getType(),
8287                                          EPI.EpilogueVF, EPI.EpilogueUF),
8288                          "min.epilog.iters.check");
8289 
8290   ReplaceInstWithInst(
8291       Insert->getTerminator(),
8292       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8293 
8294   LoopBypassBlocks.push_back(Insert);
8295   return Insert;
8296 }
8297 
8298 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8299   LLVM_DEBUG({
8300     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8301            << "Epilogue Loop VF:" << EPI.EpilogueVF
8302            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8303   });
8304 }
8305 
8306 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8307   DEBUG_WITH_TYPE(VerboseDebug, {
8308     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8309   });
8310 }
8311 
8312 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8313     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8314   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8315   bool PredicateAtRangeStart = Predicate(Range.Start);
8316 
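       // Illustrative example: for Range = [2, 16) where the predicate first
       // changes at VF 8, the range is clamped to [2, 8) and the predicate's
       // value at VF 2 is returned.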
8317   for (ElementCount TmpVF = Range.Start * 2;
8318        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8319     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8320       Range.End = TmpVF;
8321       break;
8322     }
8323 
8324   return PredicateAtRangeStart;
8325 }
8326 
8327 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8328 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8329 /// of VF's starting at a given VF and extending it as much as possible. Each
8330 /// vectorization decision can potentially shorten this sub-range during
8331 /// buildVPlan().
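     /// For example (illustrative): with MinVF = 1 and MaxVF = 8 this might
     /// build one plan covering {1}, a second covering {2, 4}, and a third
     /// covering {8}, depending on how each buildVPlan() call clamps its
     /// sub-range.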
8332 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8333                                            ElementCount MaxVF) {
8334   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8335   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8336     VFRange SubRange = {VF, MaxVFPlusOne};
8337     VPlans.push_back(buildVPlan(SubRange));
8338     VF = SubRange.End;
8339   }
8340 }
8341 
8342 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8343                                          VPlanPtr &Plan) {
8344   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8345 
8346   // Look for cached value.
8347   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8348   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8349   if (ECEntryIt != EdgeMaskCache.end())
8350     return ECEntryIt->second;
8351 
8352   VPValue *SrcMask = createBlockInMask(Src, Plan);
8353 
8354   // The terminator has to be a branch inst!
8355   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8356   assert(BI && "Unexpected terminator found");
8357 
8358   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8359     return EdgeMaskCache[Edge] = SrcMask;
8360 
8361   // If source is an exiting block, we know the exit edge is dynamically dead
8362   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8363   // adding uses of an otherwise potentially dead instruction.
8364   if (OrigLoop->isLoopExiting(Src))
8365     return EdgeMaskCache[Edge] = SrcMask;
8366 
8367   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8368   assert(EdgeMask && "No Edge Mask found for condition");
8369 
8370   if (BI->getSuccessor(0) != Dst)
8371     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8372 
8373   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8374     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8375     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8376     // The select version does not introduce new UB if SrcMask is false and
8377     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8378     VPValue *False = Plan->getOrAddVPValue(
8379         ConstantInt::getFalse(BI->getCondition()->getType()));
8380     EdgeMask =
8381         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8382   }
8383 
8384   return EdgeMaskCache[Edge] = EdgeMask;
8385 }
8386 
8387 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8388   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8389 
8390   // Look for cached value.
8391   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8392   if (BCEntryIt != BlockMaskCache.end())
8393     return BCEntryIt->second;
8394 
8395   // All-one mask is modelled as no-mask following the convention for masked
8396   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8397   VPValue *BlockMask = nullptr;
8398 
8399   if (OrigLoop->getHeader() == BB) {
8400     if (!CM.blockNeedsPredicationForAnyReason(BB))
8401       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8402 
    // Introduce the early-exit compare IV <= BTC to form the header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
    // constructing the desired canonical IV in the header block as its first
    // non-phi instruction.
8407     assert(CM.foldTailByMasking() && "must fold the tail");
8408     VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
8409     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8410     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8411     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8412 
8413     VPBuilder::InsertPointGuard Guard(Builder);
8414     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8415     if (CM.TTI.emitGetActiveLaneMask()) {
8416       VPValue *TC = Plan->getOrCreateTripCount();
8417       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
8418     } else {
8419       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8420       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8421     }
8422     return BlockMaskCache[BB] = BlockMask;
8423   }
8424 
8425   // This is the block mask. We OR all incoming edges.
8426   for (auto *Predecessor : predecessors(BB)) {
8427     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8428     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8429       return BlockMaskCache[BB] = EdgeMask;
8430 
8431     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8432       BlockMask = EdgeMask;
8433       continue;
8434     }
8435 
8436     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8437   }
8438 
8439   return BlockMaskCache[BB] = BlockMask;
8440 }
8441 
8442 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8443                                                 ArrayRef<VPValue *> Operands,
8444                                                 VFRange &Range,
8445                                                 VPlanPtr &Plan) {
8446   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8447          "Must be called with either a load or store");
8448 
8449   auto willWiden = [&](ElementCount VF) -> bool {
8450     if (VF.isScalar())
8451       return false;
8452     LoopVectorizationCostModel::InstWidening Decision =
8453         CM.getWideningDecision(I, VF);
8454     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8455            "CM decision should be taken at this point.");
8456     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8457       return true;
8458     if (CM.isScalarAfterVectorization(I, VF) ||
8459         CM.isProfitableToScalarize(I, VF))
8460       return false;
8461     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8462   };
8463 
8464   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8465     return nullptr;
8466 
8467   VPValue *Mask = nullptr;
8468   if (Legal->isMaskRequired(I))
8469     Mask = createBlockInMask(I->getParent(), Plan);
8470 
8471   // Determine if the pointer operand of the access is either consecutive or
8472   // reverse consecutive.
8473   LoopVectorizationCostModel::InstWidening Decision =
8474       CM.getWideningDecision(I, Range.Start);
8475   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8476   bool Consecutive =
8477       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8478 
8479   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8480     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8481                                               Consecutive, Reverse);
8482 
8483   StoreInst *Store = cast<StoreInst>(I);
8484   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8485                                             Mask, Consecutive, Reverse);
8486 }
8487 
8488 static VPWidenIntOrFpInductionRecipe *
8489 createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
8490                            VPValue *Start, const InductionDescriptor &IndDesc,
8491                            LoopVectorizationCostModel &CM, Loop &OrigLoop,
8492                            VFRange &Range) {
8493   // Returns true if an instruction \p I should be scalarized instead of
8494   // vectorized for the chosen vectorization factor.
8495   auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8496     return CM.isScalarAfterVectorization(I, VF) ||
8497            CM.isProfitableToScalarize(I, VF);
8498   };
8499 
8500   bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange(
8501       [&](ElementCount VF) {
8502         // Returns true if we should generate a scalar version of \p IV.
8503         if (ShouldScalarizeInstruction(PhiOrTrunc, VF))
8504           return true;
8505         auto isScalarInst = [&](User *U) -> bool {
8506           auto *I = cast<Instruction>(U);
8507           return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF);
8508         };
8509         return any_of(PhiOrTrunc->users(), isScalarInst);
8510       },
8511       Range);
8512   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8513       [&](ElementCount VF) {
8514         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8515       },
8516       Range);
8517   assert(IndDesc.getStartValue() ==
8518          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8519   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8520     return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI,
8521                                              NeedsScalarIV, !NeedsScalarIVOnly);
8522   }
8523   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8524   return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV,
8525                                            !NeedsScalarIVOnly);
8526 }
8527 
8528 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8529     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const {
8530 
8531   // Check if this is an integer or fp induction. If so, build the recipe that
8532   // produces its scalar and vector values.
8533   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8534     return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop,
8535                                       Range);
8536 
8537   return nullptr;
8538 }
8539 
8540 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8541     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8542     VPlan &Plan) const {
8543   // Optimize the special case where the source is a constant integer
8544   // induction variable. Notice that we can only optimize the 'trunc' case
8545   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8546   // (c) other casts depend on pointer size.
8547 
8548   // Determine whether \p K is a truncation based on an induction variable that
8549   // can be optimized.
8550   auto isOptimizableIVTruncate =
8551       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8552     return [=](ElementCount VF) -> bool {
8553       return CM.isOptimizableIVTruncate(K, VF);
8554     };
8555   };
8556 
8557   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8558           isOptimizableIVTruncate(I), Range)) {
8559 
8560     auto *Phi = cast<PHINode>(I->getOperand(0));
8561     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8562     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8563     return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range);
8564   }
8565   return nullptr;
8566 }
8567 
8568 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8569                                                 ArrayRef<VPValue *> Operands,
8570                                                 VPlanPtr &Plan) {
8571   // If all incoming values are equal, the incoming VPValue can be used directly
8572   // instead of creating a new VPBlendRecipe.
8573   VPValue *FirstIncoming = Operands[0];
8574   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8575         return FirstIncoming == Inc;
8576       })) {
8577     return Operands[0];
8578   }
8579 
8580   // We know that all PHIs in non-header blocks are converted into selects, so
8581   // we don't have to worry about the insertion order and we can just use the
8582   // builder. At this point we generate the predication tree. There may be
8583   // duplications since this is a simple recursive scan, but future
8584   // optimizations will clean it up.
8585   SmallVector<VPValue *, 2> OperandsWithMask;
8586   unsigned NumIncoming = Phi->getNumIncomingValues();
8587 
8588   for (unsigned In = 0; In < NumIncoming; In++) {
8589     VPValue *EdgeMask =
8590       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8591     assert((EdgeMask || NumIncoming == 1) &&
8592            "Multiple predecessors with one having a full mask");
8593     OperandsWithMask.push_back(Operands[In]);
8594     if (EdgeMask)
8595       OperandsWithMask.push_back(EdgeMask);
8596   }
8597   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8598 }
8599 
8600 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8601                                                    ArrayRef<VPValue *> Operands,
8602                                                    VFRange &Range) const {
8603 
8604   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8605       [this, CI](ElementCount VF) {
8606         return CM.isScalarWithPredication(CI, VF);
8607       },
8608       Range);
8609 
8610   if (IsPredicated)
8611     return nullptr;
8612 
8613   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8614   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8615              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8616              ID == Intrinsic::pseudoprobe ||
8617              ID == Intrinsic::experimental_noalias_scope_decl))
8618     return nullptr;
8619 
8620   auto willWiden = [&](ElementCount VF) -> bool {
8621     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use an intrinsic or a plain call for the
    // vectorized version of the instruction, i.e. whether the vector intrinsic
    // is more beneficial than a vector library call.
8626     bool NeedToScalarize = false;
8627     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8628     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8629     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8630     return UseVectorIntrinsic || !NeedToScalarize;
8631   };
8632 
8633   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8634     return nullptr;
8635 
8636   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8637   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8638 }
8639 
8640 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8641   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8642          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // The instruction should be widened, unless it is scalar after
  // vectorization, scalarization is profitable, or it is predicated.
8645   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8646     return CM.isScalarAfterVectorization(I, VF) ||
8647            CM.isProfitableToScalarize(I, VF) ||
8648            CM.isScalarWithPredication(I, VF);
8649   };
8650   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8651                                                              Range);
8652 }
8653 
8654 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8655                                            ArrayRef<VPValue *> Operands) const {
8656   auto IsVectorizableOpcode = [](unsigned Opcode) {
8657     switch (Opcode) {
8658     case Instruction::Add:
8659     case Instruction::And:
8660     case Instruction::AShr:
8661     case Instruction::BitCast:
8662     case Instruction::FAdd:
8663     case Instruction::FCmp:
8664     case Instruction::FDiv:
8665     case Instruction::FMul:
8666     case Instruction::FNeg:
8667     case Instruction::FPExt:
8668     case Instruction::FPToSI:
8669     case Instruction::FPToUI:
8670     case Instruction::FPTrunc:
8671     case Instruction::FRem:
8672     case Instruction::FSub:
8673     case Instruction::ICmp:
8674     case Instruction::IntToPtr:
8675     case Instruction::LShr:
8676     case Instruction::Mul:
8677     case Instruction::Or:
8678     case Instruction::PtrToInt:
8679     case Instruction::SDiv:
8680     case Instruction::Select:
8681     case Instruction::SExt:
8682     case Instruction::Shl:
8683     case Instruction::SIToFP:
8684     case Instruction::SRem:
8685     case Instruction::Sub:
8686     case Instruction::Trunc:
8687     case Instruction::UDiv:
8688     case Instruction::UIToFP:
8689     case Instruction::URem:
8690     case Instruction::Xor:
8691     case Instruction::ZExt:
8692       return true;
8693     }
8694     return false;
8695   };
8696 
8697   if (!IsVectorizableOpcode(I->getOpcode()))
8698     return nullptr;
8699 
8700   // Success: widen this instruction.
8701   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8702 }
8703 
8704 void VPRecipeBuilder::fixHeaderPhis() {
8705   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8706   for (VPHeaderPHIRecipe *R : PhisToFix) {
8707     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8708     VPRecipeBase *IncR =
8709         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8710     R->addOperand(IncR->getVPSingleValue());
8711   }
8712 }
8713 
8714 VPBasicBlock *VPRecipeBuilder::handleReplication(
8715     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8716     VPlanPtr &Plan) {
8717   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8718       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8719       Range);
8720 
8721   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8722       [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
8723       Range);
8724 
8725   // Even if the instruction is not marked as uniform, there are certain
8726   // intrinsic calls that can be effectively treated as such, so we check for
8727   // them here. Conservatively, we only do this for scalable vectors, since
8728   // for fixed-width VFs we can always fall back on full scalarization.
8729   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8730     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8731     case Intrinsic::assume:
8732     case Intrinsic::lifetime_start:
8733     case Intrinsic::lifetime_end:
      // For scalable vectors, if one of the operands is variant then we still
      // want to mark it as uniform, which will generate one instruction for just
8736       // the first lane of the vector. We can't scalarize the call in the same
8737       // way as for fixed-width vectors because we don't know how many lanes
8738       // there are.
8739       //
8740       // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic, generating the instruction for the first
      //      lane is still better than not generating any at all. For
8743       //      example, the input may be a splat across all lanes.
8744       //   2. For the lifetime start/end intrinsics the pointer operand only
8745       //      does anything useful when the input comes from a stack object,
8746       //      which suggests it should always be uniform. For non-stack objects
8747       //      the effect is to poison the object, which still allows us to
8748       //      remove the call.
8749       IsUniform = true;
8750       break;
8751     default:
8752       break;
8753     }
8754   }
8755 
8756   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8757                                        IsUniform, IsPredicated);
8758   setRecipe(I, Recipe);
8759   Plan->addVPValue(I, Recipe);
8760 
8761   // Find if I uses a predicated instruction. If so, it will use its scalar
8762   // value. Avoid hoisting the insert-element which packs the scalar value into
8763   // a vector value, as that happens iff all users use the vector value.
8764   for (VPValue *Op : Recipe->operands()) {
8765     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8766     if (!PredR)
8767       continue;
8768     auto *RepR =
8769         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8770     assert(RepR->isPredicated() &&
8771            "expected Replicate recipe to be predicated");
8772     RepR->setAlsoPack(false);
8773   }
8774 
8775   // Finalize the recipe for Instr, first if it is not predicated.
8776   if (!IsPredicated) {
8777     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8778     VPBB->appendRecipe(Recipe);
8779     return VPBB;
8780   }
8781   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8782 
8783   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8784   assert(SingleSucc && "VPBB must have a single successor when handling "
8785                        "predicated replication.");
8786   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8787   // Record predicated instructions for above packing optimizations.
8788   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8789   VPBlockUtils::insertBlockAfter(Region, VPBB);
8790   auto *RegSucc = new VPBasicBlock();
8791   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8792   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8793   return RegSucc;
8794 }
8795 
8796 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8797                                                       VPRecipeBase *PredRecipe,
8798                                                       VPlanPtr &Plan) {
8799   // Instructions marked for predication are replicated and placed under an
8800   // if-then construct to prevent side-effects.
8801 
8802   // Generate recipes to compute the block mask for this region.
8803   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8804 
8805   // Build the triangular if-then region.
8806   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8807   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8808   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8809   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8810   auto *PHIRecipe = Instr->getType()->isVoidTy()
8811                         ? nullptr
8812                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8813   if (PHIRecipe) {
8814     Plan->removeVPValueFor(Instr);
8815     Plan->addVPValue(Instr, PHIRecipe);
8816   }
8817   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8818   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8819   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8820 
8821   // Note: first set Entry as region entry and then connect successors starting
8822   // from it in order, to propagate the "parent" of each VPBasicBlock.
8823   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8824   VPBlockUtils::connectBlocks(Pred, Exit);
8825 
8826   return Region;
8827 }
8828 
8829 VPRecipeOrVPValueTy
8830 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8831                                         ArrayRef<VPValue *> Operands,
8832                                         VFRange &Range, VPlanPtr &Plan) {
8833   // First, check for specific widening recipes that deal with calls, memory
8834   // operations, inductions and Phi nodes.
8835   if (auto *CI = dyn_cast<CallInst>(Instr))
8836     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8837 
8838   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8839     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8840 
8841   VPRecipeBase *Recipe;
8842   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8843     if (Phi->getParent() != OrigLoop->getHeader())
8844       return tryToBlend(Phi, Operands, Plan);
8845     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8846       return toVPRecipeResult(Recipe);
8847 
8848     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8849     if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
8850       VPValue *StartV = Operands[0];
8851       if (Legal->isReductionVariable(Phi)) {
8852         const RecurrenceDescriptor &RdxDesc =
8853             Legal->getReductionVars().find(Phi)->second;
8854         assert(RdxDesc.getRecurrenceStartValue() ==
8855                Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8856         PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8857                                              CM.isInLoopReduction(Phi),
8858                                              CM.useOrderedReductions(RdxDesc));
8859       } else {
8860         PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8861       }
8862 
8863       // Record the incoming value from the backedge, so we can add the incoming
8864       // value from the backedge after all recipes have been created.
8865       recordRecipeOf(cast<Instruction>(
8866           Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8867       PhisToFix.push_back(PhiRecipe);
8868     } else {
8869       // TODO: record backedge value for remaining pointer induction phis.
8870       assert(Phi->getType()->isPointerTy() &&
8871              "only pointer phis should be handled here");
8872       assert(Legal->getInductionVars().count(Phi) &&
8873              "Not an induction variable");
8874       InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8875       VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
8876       PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
8877     }
8878 
8879     return toVPRecipeResult(PhiRecipe);
8880   }
8881 
8882   if (isa<TruncInst>(Instr) &&
8883       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8884                                                Range, *Plan)))
8885     return toVPRecipeResult(Recipe);
8886 
8887   if (!shouldWiden(Instr, Range))
8888     return nullptr;
8889 
8890   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8891     return toVPRecipeResult(new VPWidenGEPRecipe(
8892         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8893 
8894   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8895     bool InvariantCond =
8896         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8897     return toVPRecipeResult(new VPWidenSelectRecipe(
8898         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8899   }
8900 
8901   return toVPRecipeResult(tryToWiden(Instr, Operands));
8902 }
8903 
8904 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8905                                                         ElementCount MaxVF) {
8906   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8907 
8908   // Collect instructions from the original loop that will become trivially dead
8909   // in the vectorized loop. We don't need to vectorize these instructions. For
8910   // example, original induction update instructions can become dead because we
8911   // separately emit induction "steps" when generating code for the new loop.
8912   // Similarly, we create a new latch condition when setting up the structure
8913   // of the new loop, so the old one can become dead.
8914   SmallPtrSet<Instruction *, 4> DeadInstructions;
8915   collectTriviallyDeadInstructions(DeadInstructions);
8916 
8917   // Add assume instructions we need to drop to DeadInstructions, to prevent
8918   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
8921   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8922   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8923 
8924   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8925   // Dead instructions do not need sinking. Remove them from SinkAfter.
8926   for (Instruction *I : DeadInstructions)
8927     SinkAfter.erase(I);
8928 
8929   // Cannot sink instructions after dead instructions (there won't be any
8930   // recipes for them). Instead, find the first non-dead previous instruction.
8931   for (auto &P : Legal->getSinkAfter()) {
8932     Instruction *SinkTarget = P.second;
8933     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8934     (void)FirstInst;
8935     while (DeadInstructions.contains(SinkTarget)) {
8936       assert(
8937           SinkTarget != FirstInst &&
8938           "Must find a live instruction (at least the one feeding the "
8939           "first-order recurrence PHI) before reaching beginning of the block");
8940       SinkTarget = SinkTarget->getPrevNode();
8941       assert(SinkTarget != P.first &&
8942              "sink source equals target, no sinking required");
8943     }
8944     P.second = SinkTarget;
8945   }
8946 
8947   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8948   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8949     VFRange SubRange = {VF, MaxVFPlusOne};
8950     VPlans.push_back(
8951         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8952     VF = SubRange.End;
8953   }
8954 }
8955 
8956 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
8957 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
8958 // BranchOnCount VPInstruction to the latch.
8959 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8960                                   bool HasNUW, bool IsVPlanNative) {
8961   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8962   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8963 
8964   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8965   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8966   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8967   if (IsVPlanNative)
8968     Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
8969   Header->insert(CanonicalIVPHI, Header->begin());
8970 
8971   auto *CanonicalIVIncrement =
8972       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8973                                : VPInstruction::CanonicalIVIncrement,
8974                         {CanonicalIVPHI}, DL);
8975   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8976 
8977   VPBasicBlock *EB = TopRegion->getExitBasicBlock();
8978   if (IsVPlanNative) {
8979     EB = cast<VPBasicBlock>(EB->getSinglePredecessor());
8980     EB->setCondBit(nullptr);
8981   }
8982   EB->appendRecipe(CanonicalIVIncrement);
8983 
8984   auto *BranchOnCount =
8985       new VPInstruction(VPInstruction::BranchOnCount,
8986                         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8987   EB->appendRecipe(BranchOnCount);
8988 }
8989 
8990 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8991     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8992     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8993 
8994   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8995 
8996   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8997 
8998   // ---------------------------------------------------------------------------
8999   // Pre-construction: record ingredients whose recipes we'll need to further
9000   // process after constructing the initial VPlan.
9001   // ---------------------------------------------------------------------------
9002 
9003   // Mark instructions we'll need to sink later and their targets as
9004   // ingredients whose recipe we'll need to record.
9005   for (auto &Entry : SinkAfter) {
9006     RecipeBuilder.recordRecipeOf(Entry.first);
9007     RecipeBuilder.recordRecipeOf(Entry.second);
9008   }
9009   for (auto &Reduction : CM.getInLoopReductionChains()) {
9010     PHINode *Phi = Reduction.first;
9011     RecurKind Kind =
9012         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
9013     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9014 
9015     RecipeBuilder.recordRecipeOf(Phi);
9016     for (auto &R : ReductionOperations) {
9017       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
9020       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9021              "Only min/max recurrences allowed for inloop reductions");
9022       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9023         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9024     }
9025   }
9026 
9027   // For each interleave group which is relevant for this (possibly trimmed)
9028   // Range, add it to the set of groups to be later applied to the VPlan and add
9029   // placeholders for its members' Recipes which we'll be replacing with a
9030   // single VPInterleaveRecipe.
9031   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9032     auto applyIG = [IG, this](ElementCount VF) -> bool {
9033       return (VF.isVector() && // Query is illegal for VF == 1
9034               CM.getWideningDecision(IG->getInsertPos(), VF) ==
9035                   LoopVectorizationCostModel::CM_Interleave);
9036     };
9037     if (!getDecisionAndClampRange(applyIG, Range))
9038       continue;
9039     InterleaveGroups.insert(IG);
9040     for (unsigned i = 0; i < IG->getFactor(); i++)
9041       if (Instruction *Member = IG->getMember(i))
9042         RecipeBuilder.recordRecipeOf(Member);
9043   };
9044 
9045   // ---------------------------------------------------------------------------
9046   // Build initial VPlan: Scan the body of the loop in a topological order to
9047   // visit each basic block after having visited its predecessor basic blocks.
9048   // ---------------------------------------------------------------------------
9049 
9050   // Create initial VPlan skeleton, with separate header and latch blocks.
9051   VPBasicBlock *HeaderVPBB = new VPBasicBlock();
9052   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
9053   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
9054   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
9055   auto Plan = std::make_unique<VPlan>(TopRegion);
9056 
9057   Instruction *DLInst =
9058       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
9059   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
9060                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
9061                         !CM.foldTailByMasking(), false);
9062 
9063   // Scan the body of the loop in a topological order to visit each basic block
9064   // after having visited its predecessor basic blocks.
9065   LoopBlocksDFS DFS(OrigLoop);
9066   DFS.perform(LI);
9067 
9068   VPBasicBlock *VPBB = HeaderVPBB;
9069   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
9070   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9071     // Relevant instructions from basic block BB will be grouped into VPRecipe
9072     // ingredients and fill a new VPBasicBlock.
9073     unsigned VPBBsForBB = 0;
9074     VPBB->setName(BB->getName());
9075     Builder.setInsertPoint(VPBB);
9076 
9077     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
9079     for (Instruction &I : BB->instructionsWithoutDebug()) {
9080       Instruction *Instr = &I;
9081 
9082       // First filter out irrelevant instructions, to ensure no recipes are
9083       // built for them.
9084       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9085         continue;
9086 
9087       SmallVector<VPValue *, 4> Operands;
9088       auto *Phi = dyn_cast<PHINode>(Instr);
9089       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9090         Operands.push_back(Plan->getOrAddVPValue(
9091             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9092       } else {
9093         auto OpRange = Plan->mapToVPValues(Instr->operands());
9094         Operands = {OpRange.begin(), OpRange.end()};
9095       }
9096       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9097               Instr, Operands, Range, Plan)) {
9098         // If Instr can be simplified to an existing VPValue, use it.
9099         if (RecipeOrValue.is<VPValue *>()) {
9100           auto *VPV = RecipeOrValue.get<VPValue *>();
9101           Plan->addVPValue(Instr, VPV);
9102           // If the re-used value is a recipe, register the recipe for the
9103           // instruction, in case the recipe for Instr needs to be recorded.
9104           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
9105             RecipeBuilder.setRecipe(Instr, R);
9106           continue;
9107         }
9108         // Otherwise, add the new recipe.
9109         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
9110         for (auto *Def : Recipe->definedValues()) {
9111           auto *UV = Def->getUnderlyingValue();
9112           Plan->addVPValue(UV, Def);
9113         }
9114 
9115         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
9116             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
9117           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
9118           // of the header block. That can happen for truncates of induction
9119           // variables. Those recipes are moved to the phi section of the header
9120           // block after applying SinkAfter, which relies on the original
9121           // position of the trunc.
9122           assert(isa<TruncInst>(Instr));
9123           InductionsToMove.push_back(
9124               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
9125         }
9126         RecipeBuilder.setRecipe(Instr, Recipe);
9127         VPBB->appendRecipe(Recipe);
9128         continue;
9129       }
9130 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
9133       VPBasicBlock *NextVPBB =
9134           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
9135       if (NextVPBB != VPBB) {
9136         VPBB = NextVPBB;
9137         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
9138                                     : "");
9139       }
9140     }
9141 
9142     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
9143     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9144   }
9145 
9146   // Fold the last, empty block into its predecessor.
9147   VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
9148   assert(VPBB && "expected to fold last (empty) block");
9149   // After here, VPBB should not be used.
9150   VPBB = nullptr;
9151 
9152   assert(isa<VPRegionBlock>(Plan->getEntry()) &&
9153          !Plan->getEntry()->getEntryBasicBlock()->empty() &&
9154          "entry block must be set to a VPRegionBlock having a non-empty entry "
9155          "VPBasicBlock");
9156   RecipeBuilder.fixHeaderPhis();
9157 
9158   // ---------------------------------------------------------------------------
9159   // Transform initial VPlan: Apply previously taken decisions, in order, to
9160   // bring the VPlan to its final state.
9161   // ---------------------------------------------------------------------------
9162 
9163   // Apply Sink-After legal constraints.
9164   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9165     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9166     if (Region && Region->isReplicator()) {
9167       assert(Region->getNumSuccessors() == 1 &&
9168              Region->getNumPredecessors() == 1 && "Expected SESE region!");
9169       assert(R->getParent()->size() == 1 &&
9170              "A recipe in an original replicator region must be the only "
9171              "recipe in its block");
9172       return Region;
9173     }
9174     return nullptr;
9175   };
9176   for (auto &Entry : SinkAfter) {
9177     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9178     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9179 
9180     auto *TargetRegion = GetReplicateRegion(Target);
9181     auto *SinkRegion = GetReplicateRegion(Sink);
9182     if (!SinkRegion) {
9183       // If the sink source is not a replicate region, sink the recipe directly.
9184       if (TargetRegion) {
9185         // The target is in a replication region, make sure to move Sink to
9186         // the block after it, not into the replication region itself.
9187         VPBasicBlock *NextBlock =
9188             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9189         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9190       } else
9191         Sink->moveAfter(Target);
9192       continue;
9193     }
9194 
9195     // The sink source is in a replicate region. Unhook the region from the CFG.
9196     auto *SinkPred = SinkRegion->getSinglePredecessor();
9197     auto *SinkSucc = SinkRegion->getSingleSuccessor();
9198     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9199     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9200     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9201 
9202     if (TargetRegion) {
9203       // The target recipe is also in a replicate region, move the sink region
9204       // after the target region.
9205       auto *TargetSucc = TargetRegion->getSingleSuccessor();
9206       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9207       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9208       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9209     } else {
      // The sink source is in a replicate region; we need to move the whole
      // replicate region, which should contain only a single recipe in its
      // main block.
9213       auto *SplitBlock =
9214           Target->getParent()->splitAt(std::next(Target->getIterator()));
9215 
9216       auto *SplitPred = SplitBlock->getSinglePredecessor();
9217 
9218       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9219       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9220       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9221     }
9222   }
9223 
9224   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
9225   VPlanTransforms::removeRedundantInductionCasts(*Plan);
9226 
9227   // Now that sink-after is done, move induction recipes for optimized truncates
9228   // to the phi section of the header block.
9229   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
9230     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9231 
9232   // Adjust the recipes for any inloop reductions.
9233   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan,
9234                              RecipeBuilder, Range.Start);
9235 
9236   // Introduce a recipe to combine the incoming and previous values of a
9237   // first-order recurrence.
9238   for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9239     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9240     if (!RecurPhi)
9241       continue;
9242 
9243     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
9244     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9245     auto *Region = GetReplicateRegion(PrevRecipe);
9246     if (Region)
9247       InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
9248     if (Region || PrevRecipe->isPhi())
9249       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9250     else
9251       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9252 
9253     auto *RecurSplice = cast<VPInstruction>(
9254         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9255                              {RecurPhi, RecurPhi->getBackedgeValue()}));
9256 
9257     RecurPhi->replaceAllUsesWith(RecurSplice);
9258     // Set the first operand of RecurSplice to RecurPhi again, after replacing
9259     // all users.
9260     RecurSplice->setOperand(0, RecurPhi);
9261   }
9262 
9263   // Interleave memory: for each Interleave Group we marked earlier as relevant
9264   // for this VPlan, replace the Recipes widening its memory instructions with a
9265   // single VPInterleaveRecipe at its insertion point.
9266   for (auto IG : InterleaveGroups) {
9267     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9268         RecipeBuilder.getRecipe(IG->getInsertPos()));
9269     SmallVector<VPValue *, 4> StoredValues;
9270     for (unsigned i = 0; i < IG->getFactor(); ++i)
9271       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9272         auto *StoreR =
9273             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9274         StoredValues.push_back(StoreR->getStoredValue());
9275       }
9276 
9277     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9278                                         Recipe->getMask());
9279     VPIG->insertBefore(Recipe);
9280     unsigned J = 0;
9281     for (unsigned i = 0; i < IG->getFactor(); ++i)
9282       if (Instruction *Member = IG->getMember(i)) {
9283         if (!Member->getType()->isVoidTy()) {
9284           VPValue *OriginalV = Plan->getVPValue(Member);
9285           Plan->removeVPValueFor(Member);
9286           Plan->addVPValue(Member, VPIG->getVPValue(J));
9287           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9288           J++;
9289         }
9290         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9291       }
9292   }
9293 
  // From this point onwards, VPlan-to-VPlan transformations may change the plan
  // in ways that make looking up values through their original IR values
  // incorrect.
9296   Plan->disableValue2VPValue();
9297 
9298   VPlanTransforms::sinkScalarOperands(*Plan);
9299   VPlanTransforms::mergeReplicateRegions(*Plan);
9300 
9301   std::string PlanName;
9302   raw_string_ostream RSO(PlanName);
9303   ElementCount VF = Range.Start;
9304   Plan->addVF(VF);
9305   RSO << "Initial VPlan for VF={" << VF;
9306   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9307     Plan->addVF(VF);
9308     RSO << "," << VF;
9309   }
9310   RSO << "},UF>=1";
9311   RSO.flush();
9312   Plan->setName(PlanName);
9313 
9314   // Fold Exit block into its predecessor if possible.
9315   // TODO: Fold block earlier once all VPlan transforms properly maintain a
9316   // VPBasicBlock as exit.
9317   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
9318 
9319   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9320   return Plan;
9321 }
9322 
9323 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
9326   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9327   // the vectorization pipeline.
9328   assert(!OrigLoop->isInnermost());
9329   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9330 
9331   // Create new empty VPlan
9332   auto Plan = std::make_unique<VPlan>();
9333 
9334   // Build hierarchical CFG
9335   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9336   HCFGBuilder.buildHierarchicalCFG();
9337 
9338   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9339        VF *= 2)
9340     Plan->addVF(VF);
9341 
9342   if (EnableVPlanPredication) {
9343     VPlanPredicator VPP(*Plan);
9344     VPP.predicate();
9345 
9346     // Avoid running transformation to recipes until masked code generation in
9347     // VPlan-native path is in place.
9348     return Plan;
9349   }
9350 
9351   SmallPtrSet<Instruction *, 1> DeadInstructions;
9352   VPlanTransforms::VPInstructionsToVPRecipes(
9353       OrigLoop, Plan,
9354       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9355       DeadInstructions, *PSE.getSE());
9356 
9357   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9358                         true, true);
9359   return Plan;
9360 }
9361 
// Adjust the recipes for reductions. For in-loop reductions, the chain of
// instructions leading from the loop exit instruction to the phi needs to be
// converted to reductions, with one operand being vector and the other being the scalar
9365 // reduction chain. For other reductions, a select is introduced between the phi
9366 // and live-out recipes when folding the tail.
9367 void LoopVectorizationPlanner::adjustRecipesForReductions(
9368     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9369     ElementCount MinVF) {
9370   for (auto &Reduction : CM.getInLoopReductionChains()) {
9371     PHINode *Phi = Reduction.first;
9372     const RecurrenceDescriptor &RdxDesc =
9373         Legal->getReductionVars().find(Phi)->second;
9374     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9375 
9376     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9377       continue;
9378 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
9381     // which of the two operands will remain scalar and which will be reduced.
9382     // For minmax the chain will be the select instructions.
9383     Instruction *Chain = Phi;
9384     for (Instruction *R : ReductionOperations) {
9385       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9386       RecurKind Kind = RdxDesc.getRecurrenceKind();
9387 
9388       VPValue *ChainOp = Plan->getVPValue(Chain);
9389       unsigned FirstOpId;
9390       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9391              "Only min/max recurrences allowed for inloop reductions");
9392       // Recognize a call to the llvm.fmuladd intrinsic.
9393       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9394       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9395              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9396       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9397         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9398                "Expected to replace a VPWidenSelectSC");
9399         FirstOpId = 1;
9400       } else {
9401         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9402                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9403                "Expected to replace a VPWidenSC");
9404         FirstOpId = 0;
9405       }
9406       unsigned VecOpId =
9407           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9408       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9409 
9410       auto *CondOp = CM.foldTailByMasking()
9411                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9412                          : nullptr;
9413 
9414       if (IsFMulAdd) {
9415         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9416         // need to create an fmul recipe to use as the vector operand for the
9417         // fadd reduction.
9418         VPInstruction *FMulRecipe = new VPInstruction(
9419             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9420         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9421         WidenRecipe->getParent()->insert(FMulRecipe,
9422                                          WidenRecipe->getIterator());
9423         VecOp = FMulRecipe;
9424       }
9425       VPReductionRecipe *RedRecipe =
9426           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9427       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9428       Plan->removeVPValueFor(R);
9429       Plan->addVPValue(R, RedRecipe);
9430       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9431       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9432       WidenRecipe->eraseFromParent();
9433 
9434       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9435         VPRecipeBase *CompareRecipe =
9436             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9437         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9438                "Expected to replace a VPWidenSC");
9439         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9440                "Expected no remaining users");
9441         CompareRecipe->eraseFromParent();
9442       }
9443       Chain = R;
9444     }
9445   }
9446 
9447   // If tail is folded by masking, introduce selects between the phi
9448   // and the live-out instruction of each reduction, at the beginning of the
9449   // dedicated latch block.
9450   if (CM.foldTailByMasking()) {
9451     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9452     for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9453       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9454       if (!PhiR || PhiR->isInLoop())
9455         continue;
9456       VPValue *Cond =
9457           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9458       VPValue *Red = PhiR->getBackedgeValue();
9459       assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
9460              "reduction recipe must be defined before latch");
9461       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9462     }
9463   }
9464 }
9465 
9466 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9467 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9468                                VPSlotTracker &SlotTracker) const {
9469   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9470   IG->getInsertPos()->printAsOperand(O, false);
9471   O << ", ";
9472   getAddr()->printAsOperand(O, SlotTracker);
9473   VPValue *Mask = getMask();
9474   if (Mask) {
9475     O << ", ";
9476     Mask->printAsOperand(O, SlotTracker);
9477   }
9478 
9479   unsigned OpIdx = 0;
9480   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9481     if (!IG->getMember(i))
9482       continue;
9483     if (getNumStoreOperands() > 0) {
9484       O << "\n" << Indent << "  store ";
9485       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9486       O << " to index " << i;
9487     } else {
9488       O << "\n" << Indent << "  ";
9489       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9490       O << " = load from index " << i;
9491     }
9492     ++OpIdx;
9493   }
9494 }
9495 #endif
9496 
9497 void VPWidenCallRecipe::execute(VPTransformState &State) {
9498   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9499                                   *this, State);
9500 }
9501 
9502 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9503   auto &I = *cast<SelectInst>(getUnderlyingInstr());
9504   State.ILV->setDebugLocFromInst(&I);
9505 
  // The condition can be loop invariant but still defined inside the
9507   // loop. This means that we can't just use the original 'cond' value.
9508   // We have to take the 'vectorized' value and pick the first lane.
9509   // Instcombine will make this a no-op.
9510   auto *InvarCond =
9511       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9512 
9513   for (unsigned Part = 0; Part < State.UF; ++Part) {
9514     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9515     Value *Op0 = State.get(getOperand(1), Part);
9516     Value *Op1 = State.get(getOperand(2), Part);
9517     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9518     State.set(this, Sel, Part);
9519     State.ILV->addMetadata(Sel, &I);
9520   }
9521 }
9522 
9523 void VPWidenRecipe::execute(VPTransformState &State) {
9524   auto &I = *cast<Instruction>(getUnderlyingValue());
9525   auto &Builder = State.Builder;
9526   switch (I.getOpcode()) {
9527   case Instruction::Call:
9528   case Instruction::Br:
9529   case Instruction::PHI:
9530   case Instruction::GetElementPtr:
9531   case Instruction::Select:
9532     llvm_unreachable("This instruction is handled by a different recipe.");
9533   case Instruction::UDiv:
9534   case Instruction::SDiv:
9535   case Instruction::SRem:
9536   case Instruction::URem:
9537   case Instruction::Add:
9538   case Instruction::FAdd:
9539   case Instruction::Sub:
9540   case Instruction::FSub:
9541   case Instruction::FNeg:
9542   case Instruction::Mul:
9543   case Instruction::FMul:
9544   case Instruction::FDiv:
9545   case Instruction::FRem:
9546   case Instruction::Shl:
9547   case Instruction::LShr:
9548   case Instruction::AShr:
9549   case Instruction::And:
9550   case Instruction::Or:
9551   case Instruction::Xor: {
9552     // Just widen unops and binops.
9553     State.ILV->setDebugLocFromInst(&I);
9554 
9555     for (unsigned Part = 0; Part < State.UF; ++Part) {
9556       SmallVector<Value *, 2> Ops;
9557       for (VPValue *VPOp : operands())
9558         Ops.push_back(State.get(VPOp, Part));
9559 
9560       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
9561 
9562       if (auto *VecOp = dyn_cast<Instruction>(V)) {
9563         VecOp->copyIRFlags(&I);
9564 
9565         // If the instruction is vectorized and was in a basic block that needed
9566         // predication, we can't propagate poison-generating flags (nuw/nsw,
9567         // exact, etc.). The control flow has been linearized and the
        // instruction is no longer guarded by the predicate, so the flag
        // properties may no longer hold.
9570         if (State.MayGeneratePoisonRecipes.contains(this))
9571           VecOp->dropPoisonGeneratingFlags();
9572       }
9573 
9574       // Use this vector value for all users of the original instruction.
9575       State.set(this, V, Part);
9576       State.ILV->addMetadata(V, &I);
9577     }
9578 
9579     break;
9580   }
9581   case Instruction::ICmp:
9582   case Instruction::FCmp: {
9583     // Widen compares. Generate vector compares.
9584     bool FCmp = (I.getOpcode() == Instruction::FCmp);
9585     auto *Cmp = cast<CmpInst>(&I);
9586     State.ILV->setDebugLocFromInst(Cmp);
9587     for (unsigned Part = 0; Part < State.UF; ++Part) {
9588       Value *A = State.get(getOperand(0), Part);
9589       Value *B = State.get(getOperand(1), Part);
9590       Value *C = nullptr;
9591       if (FCmp) {
9592         // Propagate fast math flags.
9593         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9594         Builder.setFastMathFlags(Cmp->getFastMathFlags());
9595         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
9596       } else {
9597         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
9598       }
9599       State.set(this, C, Part);
9600       State.ILV->addMetadata(C, &I);
9601     }
9602 
9603     break;
9604   }
9605 
9606   case Instruction::ZExt:
9607   case Instruction::SExt:
9608   case Instruction::FPToUI:
9609   case Instruction::FPToSI:
9610   case Instruction::FPExt:
9611   case Instruction::PtrToInt:
9612   case Instruction::IntToPtr:
9613   case Instruction::SIToFP:
9614   case Instruction::UIToFP:
9615   case Instruction::Trunc:
9616   case Instruction::FPTrunc:
9617   case Instruction::BitCast: {
9618     auto *CI = cast<CastInst>(&I);
9619     State.ILV->setDebugLocFromInst(CI);
9620 
    // Vectorize casts.
9622     Type *DestTy = (State.VF.isScalar())
9623                        ? CI->getType()
9624                        : VectorType::get(CI->getType(), State.VF);
9625 
9626     for (unsigned Part = 0; Part < State.UF; ++Part) {
9627       Value *A = State.get(getOperand(0), Part);
9628       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
9629       State.set(this, Cast, Part);
9630       State.ILV->addMetadata(Cast, &I);
9631     }
9632     break;
9633   }
9634   default:
9635     // This instruction is not vectorized by simple widening.
9636     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
9637     llvm_unreachable("Unhandled instruction!");
9638   } // end of switch.
9639 }
9640 
9641 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9642   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
9643   // Construct a vector GEP by widening the operands of the scalar GEP as
9644   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
9645   // results in a vector of pointers when at least one operand of the GEP
9646   // is vector-typed. Thus, to keep the representation compact, we only use
9647   // vector-typed operands for loop-varying values.
9648 
9649   if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
9650     // If we are vectorizing, but the GEP has only loop-invariant operands,
9651     // the GEP we build (by only using vector-typed operands for
9652     // loop-varying values) would be a scalar pointer. Thus, to ensure we
9653     // produce a vector of pointers, we need to either arbitrarily pick an
9654     // operand to broadcast, or broadcast a clone of the original GEP.
9655     // Here, we broadcast a clone of the original.
9656     //
9657     // TODO: If at some point we decide to scalarize instructions having
9658     //       loop-invariant operands, this special case will no longer be
9659     //       required. We would add the scalarization decision to
9660     //       collectLoopScalars() and teach getVectorValue() to broadcast
9661     //       the lane-zero scalar value.
9662     auto *Clone = State.Builder.Insert(GEP->clone());
9663     for (unsigned Part = 0; Part < State.UF; ++Part) {
9664       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
9665       State.set(this, EntryPart, Part);
9666       State.ILV->addMetadata(EntryPart, GEP);
9667     }
9668   } else {
9669     // If the GEP has at least one loop-varying operand, we are sure to
9670     // produce a vector of pointers. But if we are only unrolling, we want
9671     // to produce a scalar GEP for each unroll part. Thus, the GEP we
9672     // produce with the code below will be scalar (if VF == 1) or vector
9673     // (otherwise). Note that for the unroll-only case, we still maintain
9674     // values in the vector mapping with initVector, as we do for other
9675     // instructions.
9676     for (unsigned Part = 0; Part < State.UF; ++Part) {
9677       // The pointer operand of the new GEP. If it's loop-invariant, we
9678       // won't broadcast it.
9679       auto *Ptr = IsPtrLoopInvariant
9680                       ? State.get(getOperand(0), VPIteration(0, 0))
9681                       : State.get(getOperand(0), Part);
9682 
9683       // Collect all the indices for the new GEP. If any index is
9684       // loop-invariant, we won't broadcast it.
9685       SmallVector<Value *, 4> Indices;
9686       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9687         VPValue *Operand = getOperand(I);
9688         if (IsIndexLoopInvariant[I - 1])
9689           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9690         else
9691           Indices.push_back(State.get(Operand, Part));
9692       }
9693 
9694       // If the GEP instruction is vectorized and was in a basic block that
9695       // needed predication, we can't propagate the poison-generating 'inbounds'
9696       // flag. The control flow has been linearized and the GEP is no longer
      // guarded by the predicate, which could cause the 'inbounds' property to
      // no longer hold.
9699       bool IsInBounds =
9700           GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9701 
9702       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
9703       // but it should be a vector, otherwise.
9704       auto *NewGEP = IsInBounds
9705                          ? State.Builder.CreateInBoundsGEP(
9706                                GEP->getSourceElementType(), Ptr, Indices)
9707                          : State.Builder.CreateGEP(GEP->getSourceElementType(),
9708                                                    Ptr, Indices);
9709       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9710              "NewGEP is not a pointer vector");
9711       State.set(this, NewGEP, Part);
9712       State.ILV->addMetadata(NewGEP, GEP);
9713     }
9714   }
9715 }
9716 
9717 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9718   assert(!State.Instance && "Int or FP induction being replicated.");
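  // The widened induction is derived from the plan's canonical IV, whose
  // part-0 value is fetched here.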
9719   auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
9720   State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV);
9721 }
9722 
9723 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9724   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
9725                                  State);
9726 }
9727 
9728 void VPBlendRecipe::execute(VPTransformState &State) {
9729   State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9730   // We know that all PHIs in non-header blocks are converted into
9731   // selects, so we don't have to worry about the insertion order and we
9732   // can just use the builder.
9733   // At this point we generate the predication tree. There may be
9734   // duplications since this is a simple recursive scan, but future
9735   // optimizations will clean it up.
9736 
9737   unsigned NumIncoming = getNumIncomingValues();
9738 
9739   // Generate a sequence of selects of the form:
9740   // SELECT(Mask3, In3,
9741   //        SELECT(Mask2, In2,
9742   //               SELECT(Mask1, In1,
9743   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, are taken from In0.
9746   InnerLoopVectorizer::VectorParts Entry(State.UF);
9747   for (unsigned In = 0; In < NumIncoming; ++In) {
9748     for (unsigned Part = 0; Part < State.UF; ++Part) {
9749       // We might have single edge PHIs (blocks) - use an identity
9750       // 'select' for the first PHI operand.
9751       Value *In0 = State.get(getIncomingValue(In), Part);
9752       if (In == 0)
9753         Entry[Part] = In0; // Initialize with the first incoming value.
9754       else {
9755         // Select between the current value and the previous incoming edge
9756         // based on the incoming mask.
9757         Value *Cond = State.get(getMask(In), Part);
9758         Entry[Part] =
9759             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9760       }
9761     }
9762   }
9763   for (unsigned Part = 0; Part < State.UF; ++Part)
9764     State.set(this, Entry[Part], Part);
9765 }
9766 
9767 void VPInterleaveRecipe::execute(VPTransformState &State) {
9768   assert(!State.Instance && "Interleave group being replicated.");
9769   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9770                                       getStoredValues(), getMask());
9771 }
9772 
9773 void VPReductionRecipe::execute(VPTransformState &State) {
9774   assert(!State.Instance && "Reduction being replicated.");
9775   Value *PrevInChain = State.get(getChainOp(), 0);
9776   RecurKind Kind = RdxDesc->getRecurrenceKind();
9777   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9778   // Propagate the fast-math flags carried by the underlying instruction.
9779   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9780   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9781   for (unsigned Part = 0; Part < State.UF; ++Part) {
9782     Value *NewVecOp = State.get(getVecOp(), Part);
9783     if (VPValue *Cond = getCondOp()) {
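      // For conditional (predicated) reductions, replace masked-off lanes
      // with the reduction identity value so they do not affect the result.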
9784       Value *NewCond = State.get(Cond, Part);
9785       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9786       Value *Iden = RdxDesc->getRecurrenceIdentity(
9787           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9788       Value *IdenVec =
9789           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9790       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9791       NewVecOp = Select;
9792     }
9793     Value *NewRed;
9794     Value *NextInChain;
9795     if (IsOrdered) {
9796       if (State.VF.isVector())
9797         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9798                                         PrevInChain);
9799       else
9800         NewRed = State.Builder.CreateBinOp(
9801             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9802             NewVecOp);
9803       PrevInChain = NewRed;
9804     } else {
9805       PrevInChain = State.get(getChainOp(), Part);
9806       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9807     }
9808     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9809       NextInChain =
9810           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9811                          NewRed, PrevInChain);
9812     } else if (IsOrdered)
9813       NextInChain = NewRed;
9814     else
9815       NextInChain = State.Builder.CreateBinOp(
9816           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9817           PrevInChain);
9818     State.set(this, NextInChain, Part);
9819   }
9820 }
9821 
9822 void VPReplicateRecipe::execute(VPTransformState &State) {
9823   if (State.Instance) { // Generate a single instance.
9824     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9825     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9826                                     IsPredicated, State);
9827     // Insert scalar instance packing it into a vector.
9828     if (AlsoPack && State.VF.isVector()) {
9829       // If we're constructing lane 0, initialize to start from poison.
9830       if (State.Instance->Lane.isFirstLane()) {
9831         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9832         Value *Poison = PoisonValue::get(
9833             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9834         State.set(this, Poison, State.Instance->Part);
9835       }
9836       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9837     }
9838     return;
9839   }
9840 
9841   // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for each
9843   // of the UF parts.
9844   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9845   assert((!State.VF.isScalable() || IsUniform) &&
9846          "Can't scalarize a scalable vector");
9847   for (unsigned Part = 0; Part < State.UF; ++Part)
9848     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9849       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9850                                       VPIteration(Part, Lane), IsPredicated,
9851                                       State);
9852 }
9853 
9854 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9855   assert(State.Instance && "Branch on Mask works only on single instance.");
9856 
9857   unsigned Part = State.Instance->Part;
9858   unsigned Lane = State.Instance->Lane.getKnownLane();
9859 
9860   Value *ConditionBit = nullptr;
9861   VPValue *BlockInMask = getMask();
9862   if (BlockInMask) {
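    // The mask is per-lane; if it is a vector, extract the bit for the lane
    // currently being generated.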
9863     ConditionBit = State.get(BlockInMask, Part);
9864     if (ConditionBit->getType()->isVectorTy())
9865       ConditionBit = State.Builder.CreateExtractElement(
9866           ConditionBit, State.Builder.getInt32(Lane));
9867   } else // Block in mask is all-one.
9868     ConditionBit = State.Builder.getTrue();
9869 
9870   // Replace the temporary unreachable terminator with a new conditional branch,
9871   // whose two destinations will be set later when they are created.
9872   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9873   assert(isa<UnreachableInst>(CurrentTerminator) &&
9874          "Expected to replace unreachable terminator with conditional branch.");
9875   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9876   CondBr->setSuccessor(0, nullptr);
9877   ReplaceInstWithInst(CurrentTerminator, CondBr);
9878 }
9879 
9880 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9881   assert(State.Instance && "Predicated instruction PHI works per instance.");
9882   Instruction *ScalarPredInst =
9883       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9884   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9885   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9886   assert(PredicatingBB && "Predicated block has no single predecessor.");
9887   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9888          "operand must be VPReplicateRecipe");
9889 
9890   // By current pack/unpack logic we need to generate only a single phi node: if
9891   // a vector value for the predicated instruction exists at this point it means
9892   // the instruction has vector users only, and a phi for the vector value is
9893   // needed. In this case the recipe of the predicated instruction is marked to
9894   // also do that packing, thereby "hoisting" the insert-element sequence.
9895   // Otherwise, a phi node for the scalar value is needed.
9896   unsigned Part = State.Instance->Part;
9897   if (State.hasVectorValue(getOperand(0), Part)) {
9898     Value *VectorValue = State.get(getOperand(0), Part);
9899     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9900     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9901     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9902     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9903     if (State.hasVectorValue(this, Part))
9904       State.reset(this, VPhi, Part);
9905     else
9906       State.set(this, VPhi, Part);
9907     // NOTE: Currently we need to update the value of the operand, so the next
9908     // predicated iteration inserts its generated value in the correct vector.
9909     State.reset(getOperand(0), VPhi, Part);
9910   } else {
9911     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9912     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9913     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9914                      PredicatingBB);
9915     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9916     if (State.hasScalarValue(this, *State.Instance))
9917       State.reset(this, Phi, *State.Instance);
9918     else
9919       State.set(this, Phi, *State.Instance);
9920     // NOTE: Currently we need to update the value of the operand, so the next
9921     // predicated iteration inserts its generated value in the correct vector.
9922     State.reset(getOperand(0), Phi, *State.Instance);
9923   }
9924 }
9925 
9926 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9927   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9928 
9929   // Attempt to issue a wide load.
9930   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9931   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9932 
9933   assert((LI || SI) && "Invalid Load/Store instruction");
9934   assert((!SI || StoredValue) && "No stored value provided for widened store");
9935   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9936 
9937   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9938 
9939   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9940   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9941   bool CreateGatherScatter = !Consecutive;
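  // Non-consecutive accesses are widened using gathers/scatters rather than
  // wide loads/stores.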
9942 
9943   auto &Builder = State.Builder;
9944   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9945   bool isMaskRequired = getMask();
9946   if (isMaskRequired)
9947     for (unsigned Part = 0; Part < State.UF; ++Part)
9948       BlockInMaskParts[Part] = State.get(getMask(), Part);
9949 
9950   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9951     // Calculate the pointer for the specific unroll-part.
9952     GetElementPtrInst *PartPtr = nullptr;
9953 
9954     bool InBounds = false;
9955     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9956       InBounds = gep->isInBounds();
9957     if (Reverse) {
9958       // If the address is consecutive but reversed, then the
9959       // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width vectors VScale is 1, so RunTimeVF equals
      // VF.getKnownMinValue().
9962       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9963       // NumElt = -Part * RunTimeVF
9964       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9965       // LastLane = 1 - RunTimeVF
9966       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
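      // For example, with a fixed VF of 4 and Part == 1, NumElt is -4 and
      // LastLane is -3, so the part pointer ends up at Ptr[-7], the lowest
      // address covered by the reversed second part.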
9967       PartPtr =
9968           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9969       PartPtr->setIsInBounds(InBounds);
9970       PartPtr = cast<GetElementPtrInst>(
9971           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9972       PartPtr->setIsInBounds(InBounds);
9973       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9974         BlockInMaskParts[Part] =
9975             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9976     } else {
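      // For the forward (non-reversed) case, simply advance the pointer by
      // Part * VF elements; for scalable vectors the step is computed at
      // runtime.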
9977       Value *Increment =
9978           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9979       PartPtr = cast<GetElementPtrInst>(
9980           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9981       PartPtr->setIsInBounds(InBounds);
9982     }
9983 
9984     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
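    // The GEP above is typed on the scalar element; cast the result to a
    // pointer to the widened vector type expected by the wide load/store.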
9985     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9986   };
9987 
9988   // Handle Stores:
9989   if (SI) {
9990     State.ILV->setDebugLocFromInst(SI);
9991 
9992     for (unsigned Part = 0; Part < State.UF; ++Part) {
9993       Instruction *NewSI = nullptr;
9994       Value *StoredVal = State.get(StoredValue, Part);
9995       if (CreateGatherScatter) {
9996         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9997         Value *VectorGep = State.get(getAddr(), Part);
9998         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9999                                             MaskPart);
10000       } else {
10001         if (Reverse) {
10002           // If we store to reverse consecutive memory locations, then we need
10003           // to reverse the order of elements in the stored value.
10004           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
10005           // We don't want to update the value in the map as it might be used in
10006           // another expression. So don't call resetVectorValue(StoredVal).
10007         }
10008         auto *VecPtr =
10009             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10010         if (isMaskRequired)
10011           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
10012                                             BlockInMaskParts[Part]);
10013         else
10014           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
10015       }
10016       State.ILV->addMetadata(NewSI, SI);
10017     }
10018     return;
10019   }
10020 
10021   // Handle loads.
10022   assert(LI && "Must have a load instruction");
10023   State.ILV->setDebugLocFromInst(LI);
10024   for (unsigned Part = 0; Part < State.UF; ++Part) {
10025     Value *NewLI;
10026     if (CreateGatherScatter) {
10027       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
10028       Value *VectorGep = State.get(getAddr(), Part);
10029       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
10030                                          nullptr, "wide.masked.gather");
10031       State.ILV->addMetadata(NewLI, LI);
10032     } else {
10033       auto *VecPtr =
10034           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10035       if (isMaskRequired)
10036         NewLI = Builder.CreateMaskedLoad(
10037             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
10038             PoisonValue::get(DataTy), "wide.masked.load");
10039       else
10040         NewLI =
10041             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
10042 
      // Add metadata to the original load, but set the vector value to the
      // reverse shuffle.
10044       State.ILV->addMetadata(NewLI, LI);
10045       if (Reverse)
10046         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
10047     }
10048 
10049     State.set(this, NewLI, Part);
10050   }
10051 }
10052 
10053 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10054 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10055 // predication, and 4) a TTI hook that analyses whether the loop is suitable
10056 // for predication.
10057 static ScalarEpilogueLowering getScalarEpilogueLowering(
10058     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10059     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10060     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
10061     LoopVectorizationLegality &LVL) {
10062   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10063   // don't look at hints or options, and don't request a scalar epilogue.
10064   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10065   // LoopAccessInfo (due to code dependency and not being able to reliably get
10066   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10067   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10068   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10069   // back to the old way and vectorize with versioning when forced. See D81345.)
10070   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10071                                                       PGSOQueryType::IRPass) &&
10072                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10073     return CM_ScalarEpilogueNotAllowedOptSize;
10074 
10075   // 2) If set, obey the directives
10076   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10077     switch (PreferPredicateOverEpilogue) {
10078     case PreferPredicateTy::ScalarEpilogue:
10079       return CM_ScalarEpilogueAllowed;
10080     case PreferPredicateTy::PredicateElseScalarEpilogue:
10081       return CM_ScalarEpilogueNotNeededUsePredicate;
10082     case PreferPredicateTy::PredicateOrDontVectorize:
10083       return CM_ScalarEpilogueNotAllowedUsePredicate;
10084     };
10085   }
10086 
10087   // 3) If set, obey the hints
10088   switch (Hints.getPredicate()) {
10089   case LoopVectorizeHints::FK_Enabled:
10090     return CM_ScalarEpilogueNotNeededUsePredicate;
10091   case LoopVectorizeHints::FK_Disabled:
10092     return CM_ScalarEpilogueAllowed;
10093   };
10094 
10095   // 4) if the TTI hook indicates this is profitable, request predication.
10096   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10097                                        LVL.getLAI()))
10098     return CM_ScalarEpilogueNotNeededUsePredicate;
10099 
10100   return CM_ScalarEpilogueAllowed;
10101 }
10102 
10103 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If Values have been set for this Def, return the one relevant for
  // \p Part.
10105   if (hasVectorValue(Def, Part))
10106     return Data.PerPartOutput[Def][Part];
10107 
10108   if (!hasScalarValue(Def, {Part, 0})) {
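    // Neither a per-part nor a per-lane value exists for Def, so it must be
    // a live-in IR value; broadcast it and cache the splat for this Part.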
10109     Value *IRV = Def->getLiveInIRValue();
10110     Value *B = ILV->getBroadcastInstrs(IRV);
10111     set(Def, B, Part);
10112     return B;
10113   }
10114 
10115   Value *ScalarValue = get(Def, {Part, 0});
10116   // If we aren't vectorizing, we can just copy the scalar map values over
10117   // to the vector map.
10118   if (VF.isScalar()) {
10119     set(Def, ScalarValue, Part);
10120     return ScalarValue;
10121   }
10122 
10123   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
10124   bool IsUniform = RepR && RepR->isUniform();
10125 
10126   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
10127   // Check if there is a scalar value for the selected lane.
10128   if (!hasScalarValue(Def, {Part, LastLane})) {
10129     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
10130     assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
10131            "unexpected recipe found to be invariant");
10132     IsUniform = true;
10133     LastLane = 0;
10134   }
10135 
10136   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10137   // Set the insert point after the last scalarized instruction or after the
10138   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10139   // will directly follow the scalar definitions.
10140   auto OldIP = Builder.saveIP();
10141   auto NewIP =
10142       isa<PHINode>(LastInst)
10143           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10144           : std::next(BasicBlock::iterator(LastInst));
10145   Builder.SetInsertPoint(&*NewIP);
10146 
10147   // However, if we are vectorizing, we need to construct the vector values.
10148   // If the value is known to be uniform after vectorization, we can just
10149   // broadcast the scalar value corresponding to lane zero for each unroll
10150   // iteration. Otherwise, we construct the vector values using
10151   // insertelement instructions. Since the resulting vectors are stored in
10152   // State, we will only generate the insertelements once.
10153   Value *VectorValue = nullptr;
10154   if (IsUniform) {
10155     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10156     set(Def, VectorValue, Part);
10157   } else {
    // Initialize packing with insertelements to start from poison.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Poison, Part);
10162     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10163       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10164     VectorValue = get(Def, Part);
10165   }
10166   Builder.restoreIP(OldIP);
10167   return VectorValue;
10168 }
10169 
10170 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
10172 // VPlan-to-VPlan transformations from the very beginning without modifying the
10173 // input LLVM IR.
10174 static bool processLoopInVPlanNativePath(
10175     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10176     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10177     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10178     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10179     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10180     LoopVectorizationRequirements &Requirements) {
10181 
10182   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10183     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10184     return false;
10185   }
10186   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10187   Function *F = L->getHeader()->getParent();
10188   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10189 
10190   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10191       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10192 
10193   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10194                                 &Hints, IAI);
10195   // Use the planner for outer loop vectorization.
10196   // TODO: CM is not used at this point inside the planner. Turn CM into an
10197   // optional argument if we don't need it in the future.
10198   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10199                                Requirements, ORE);
10200 
10201   // Get user vectorization factor.
10202   ElementCount UserVF = Hints.getWidth();
10203 
10204   CM.collectElementTypesForWidening();
10205 
10206   // Plan how to best vectorize, return the best VF and its cost.
10207   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10208 
10209   // If we are stress testing VPlan builds, do not attempt to generate vector
10210   // code. Masked vector code generation support will follow soon.
10211   // Also, do not attempt to vectorize if no vector code will be produced.
10212   if (VPlanBuildStressTest || EnableVPlanPredication ||
10213       VectorizationFactor::Disabled() == VF)
10214     return false;
10215 
10216   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10217 
10218   {
10219     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10220                              F->getParent()->getDataLayout());
10221     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10222                            &CM, BFI, PSI, Checks);
10223     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10224                       << L->getHeader()->getParent()->getName() << "\"\n");
10225     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10226   }
10227 
10228   // Mark the loop as already vectorized to avoid vectorizing again.
10229   Hints.setAlreadyVectorized();
10230   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10231   return true;
10232 }
10233 
// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated in a wider floating-point
// precision, there will be a performance penalty from the conversion overhead
// and the change in the vector width.
10238 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10239   SmallVector<Instruction *, 4> Worklist;
10240   for (BasicBlock *BB : L->getBlocks()) {
10241     for (Instruction &Inst : *BB) {
10242       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10243         if (S->getValueOperand()->getType()->isFloatTy())
10244           Worklist.push_back(S);
10245       }
10246     }
10247   }
10248 
  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
10251   SmallPtrSet<const Instruction *, 4> Visited;
10252   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10253   while (!Worklist.empty()) {
10254     auto *I = Worklist.pop_back_val();
10255     if (!L->contains(I))
10256       continue;
10257     if (!Visited.insert(I).second)
10258       continue;
10259 
10260     // Emit a remark if the floating point store required a floating
10261     // point conversion.
10262     // TODO: More work could be done to identify the root cause such as a
10263     // constant or a function return type and point the user to it.
10264     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10265       ORE->emit([&]() {
10266         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10267                                           I->getDebugLoc(), L->getHeader())
10268                << "floating point conversion changes vector width. "
10269                << "Mixed floating point precision requires an up/down "
10270                << "cast that will negatively impact performance.";
10271       });
10272 
10273     for (Use &Op : I->operands())
10274       if (auto *OpI = dyn_cast<Instruction>(Op))
10275         Worklist.push_back(OpI);
10276   }
10277 }
10278 
10279 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10280     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10281                                !EnableLoopInterleaving),
10282       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10283                               !EnableLoopVectorization) {}
10284 
10285 bool LoopVectorizePass::processLoop(Loop *L) {
10286   assert((EnableVPlanNativePath || L->isInnermost()) &&
10287          "VPlan-native path is not enabled. Only process inner loops.");
10288 
10289 #ifndef NDEBUG
10290   const std::string DebugLocStr = getDebugLocString(L);
10291 #endif /* NDEBUG */
10292 
10293   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
10294                     << L->getHeader()->getParent()->getName() << "\" from "
10295                     << DebugLocStr << "\n");
10296 
10297   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10298 
10299   LLVM_DEBUG(
10300       dbgs() << "LV: Loop hints:"
10301              << " force="
10302              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10303                      ? "disabled"
10304                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10305                             ? "enabled"
10306                             : "?"))
10307              << " width=" << Hints.getWidth()
10308              << " interleave=" << Hints.getInterleave() << "\n");
10309 
10310   // Function containing loop
10311   Function *F = L->getHeader()->getParent();
10312 
10313   // Looking at the diagnostic output is the only way to determine if a loop
10314   // was vectorized (other than looking at the IR or machine code), so it
10315   // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose; they report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
10320 
10321   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10322     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10323     return false;
10324   }
10325 
10326   PredicatedScalarEvolution PSE(*SE, *L);
10327 
10328   // Check if it is legal to vectorize the loop.
10329   LoopVectorizationRequirements Requirements;
10330   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10331                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10332   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10333     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10334     Hints.emitRemarkWithHints();
10335     return false;
10336   }
10337 
10338   // Check the function attributes and profiles to find out if this function
10339   // should be optimized for size.
10340   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10341       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10342 
10343   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10344   // here. They may require CFG and instruction level transformations before
10345   // even evaluating whether vectorization is profitable. Since we cannot modify
10346   // the incoming IR, we need to build VPlan upfront in the vectorization
10347   // pipeline.
10348   if (!L->isInnermost())
10349     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10350                                         ORE, BFI, PSI, Hints, Requirements);
10351 
10352   assert(L->isInnermost() && "Inner loop expected.");
10353 
10354   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10355   // count by optimizing for size, to minimize overheads.
10356   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10357   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10358     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10359                       << "This loop is worth vectorizing only if no scalar "
10360                       << "iteration overheads are incurred.");
10361     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10362       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10363     else {
10364       LLVM_DEBUG(dbgs() << "\n");
10365       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10366     }
10367   }
10368 
10369   // Check the function attributes to see if implicit floats are allowed.
10370   // FIXME: This check doesn't seem possibly correct -- what if the loop is
10371   // an integer loop and the vector instructions selected are purely integer
10372   // vector instructions?
10373   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10374     reportVectorizationFailure(
10375         "Can't vectorize when the NoImplicitFloat attribute is used",
10376         "loop not vectorized due to NoImplicitFloat attribute",
10377         "NoImplicitFloat", ORE, L);
10378     Hints.emitRemarkWithHints();
10379     return false;
10380   }
10381 
10382   // Check if the target supports potentially unsafe FP vectorization.
10383   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10384   // for the target we're vectorizing for, to make sure none of the
10385   // additional fp-math flags can help.
10386   if (Hints.isPotentiallyUnsafe() &&
10387       TTI->isFPVectorizationPotentiallyUnsafe()) {
10388     reportVectorizationFailure(
10389         "Potentially unsafe FP op prevents vectorization",
10390         "loop not vectorized due to unsafe FP support.",
10391         "UnsafeFP", ORE, L);
10392     Hints.emitRemarkWithHints();
10393     return false;
10394   }
10395 
10396   bool AllowOrderedReductions;
10397   // If the flag is set, use that instead and override the TTI behaviour.
10398   if (ForceOrderedReductions.getNumOccurrences() > 0)
10399     AllowOrderedReductions = ForceOrderedReductions;
10400   else
10401     AllowOrderedReductions = TTI->enableOrderedReductions();
10402   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10403     ORE->emit([&]() {
10404       auto *ExactFPMathInst = Requirements.getExactFPInst();
10405       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10406                                                  ExactFPMathInst->getDebugLoc(),
10407                                                  ExactFPMathInst->getParent())
10408              << "loop not vectorized: cannot prove it is safe to reorder "
10409                 "floating-point operations";
10410     });
10411     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10412                          "reorder floating-point operations\n");
10413     Hints.emitRemarkWithHints();
10414     return false;
10415   }
10416 
10417   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10418   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10419 
10420   // If an override option has been passed in for interleaved accesses, use it.
10421   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10422     UseInterleaved = EnableInterleavedMemAccesses;
10423 
10424   // Analyze interleaved memory accesses.
10425   if (UseInterleaved) {
10426     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10427   }
10428 
10429   // Use the cost model.
10430   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10431                                 F, &Hints, IAI);
10432   CM.collectValuesToIgnore();
10433   CM.collectElementTypesForWidening();
10434 
10435   // Use the planner for vectorization.
10436   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10437                                Requirements, ORE);
10438 
10439   // Get user vectorization factor and interleave count.
10440   ElementCount UserVF = Hints.getWidth();
10441   unsigned UserIC = Hints.getInterleave();
10442 
10443   // Plan how to best vectorize, return the best VF and its cost.
10444   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10445 
10446   VectorizationFactor VF = VectorizationFactor::Disabled();
10447   unsigned IC = 1;
10448 
10449   if (MaybeVF) {
10450     VF = *MaybeVF;
10451     // Select the interleave count.
10452     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10453   }
10454 
10455   // Identify the diagnostic messages that should be produced.
10456   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10457   bool VectorizeLoop = true, InterleaveLoop = true;
10458   if (VF.Width.isScalar()) {
10459     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10460     VecDiagMsg = std::make_pair(
10461         "VectorizationNotBeneficial",
10462         "the cost-model indicates that vectorization is not beneficial");
10463     VectorizeLoop = false;
10464   }
10465 
10466   if (!MaybeVF && UserIC > 1) {
10467     // Tell the user interleaving was avoided up-front, despite being explicitly
10468     // requested.
10469     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10470                          "interleaving should be avoided up front\n");
10471     IntDiagMsg = std::make_pair(
10472         "InterleavingAvoided",
10473         "Ignoring UserIC, because interleaving was avoided up front");
10474     InterleaveLoop = false;
10475   } else if (IC == 1 && UserIC <= 1) {
10476     // Tell the user interleaving is not beneficial.
10477     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10478     IntDiagMsg = std::make_pair(
10479         "InterleavingNotBeneficial",
10480         "the cost-model indicates that interleaving is not beneficial");
10481     InterleaveLoop = false;
10482     if (UserIC == 1) {
10483       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10484       IntDiagMsg.second +=
10485           " and is explicitly disabled or interleave count is set to 1";
10486     }
10487   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
10489     LLVM_DEBUG(
10490         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10491     IntDiagMsg = std::make_pair(
10492         "InterleavingBeneficialButDisabled",
10493         "the cost-model indicates that interleaving is beneficial "
10494         "but is explicitly disabled or interleave count is set to 1");
10495     InterleaveLoop = false;
10496   }
10497 
10498   // Override IC if user provided an interleave count.
10499   IC = UserIC > 0 ? UserIC : IC;
10500 
10501   // Emit diagnostic messages, if any.
10502   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10503   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
10505     ORE->emit([&]() {
10506       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10507                                       L->getStartLoc(), L->getHeader())
10508              << VecDiagMsg.second;
10509     });
10510     ORE->emit([&]() {
10511       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10512                                       L->getStartLoc(), L->getHeader())
10513              << IntDiagMsg.second;
10514     });
10515     return false;
10516   } else if (!VectorizeLoop && InterleaveLoop) {
10517     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10518     ORE->emit([&]() {
10519       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10520                                         L->getStartLoc(), L->getHeader())
10521              << VecDiagMsg.second;
10522     });
10523   } else if (VectorizeLoop && !InterleaveLoop) {
10524     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10525                       << ") in " << DebugLocStr << '\n');
10526     ORE->emit([&]() {
10527       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10528                                         L->getStartLoc(), L->getHeader())
10529              << IntDiagMsg.second;
10530     });
10531   } else if (VectorizeLoop && InterleaveLoop) {
10532     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10533                       << ") in " << DebugLocStr << '\n');
10534     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10535   }
10536 
10537   bool DisableRuntimeUnroll = false;
10538   MDNode *OrigLoopID = L->getLoopID();
10539   {
10540     // Optimistically generate runtime checks. Drop them if they turn out to not
10541     // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
10543     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10544                              F->getParent()->getDataLayout());
10545     if (!VF.Width.isScalar() || IC > 1)
10546       Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
10547 
10548     using namespace ore;
10549     if (!VectorizeLoop) {
10550       assert(IC > 1 && "interleave count should not be 1 or 0");
10551       // If we decided that it is not legal to vectorize the loop, then
10552       // interleave it.
10553       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10554                                  &CM, BFI, PSI, Checks);
10555 
10556       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10557       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10558 
10559       ORE->emit([&]() {
10560         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10561                                   L->getHeader())
10562                << "interleaved loop (interleaved count: "
10563                << NV("InterleaveCount", IC) << ")";
10564       });
10565     } else {
10566       // If we decided that it is *legal* to vectorize the loop, then do it.
10567 
10568       // Consider vectorizing the epilogue too if it's profitable.
10569       VectorizationFactor EpilogueVF =
10570           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10571       if (EpilogueVF.Width.isVector()) {
10572 
10573         // The first pass vectorizes the main loop and creates a scalar epilogue
10574         // to be vectorized by executing the plan (potentially with a different
10575         // factor) again shortly afterwards.
10576         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10577         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10578                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10579 
10580         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10581         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10582                         DT);
10583         ++LoopsVectorized;
10584 
10585         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10586         formLCSSARecursively(*L, *DT, LI, SE);
10587 
10588         // Second pass vectorizes the epilogue and adjusts the control flow
10589         // edges from the first pass.
10590         EPI.MainLoopVF = EPI.EpilogueVF;
10591         EPI.MainLoopUF = EPI.EpilogueUF;
10592         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10593                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10594                                                  Checks);
10595 
10596         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10597 
10598         // Ensure that the start values for any VPReductionPHIRecipes are
10599         // updated before vectorising the epilogue loop.
10600         VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock();
10601         for (VPRecipeBase &R : Header->phis()) {
10602           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10603             if (auto *Resume = MainILV.getReductionResumeValue(
10604                     ReductionPhi->getRecurrenceDescriptor())) {
10605               VPValue *StartVal = new VPValue(Resume);
10606               BestEpiPlan.addExternalDef(StartVal);
10607               ReductionPhi->setOperand(0, StartVal);
10608             }
10609           }
10610         }
10611 
10612         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10613                         DT);
10614         ++LoopsEpilogueVectorized;
10615 
10616         if (!MainILV.areSafetyChecksAdded())
10617           DisableRuntimeUnroll = true;
10618       } else {
10619         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10620                                &LVL, &CM, BFI, PSI, Checks);
10621 
10622         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10623         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
10624         ++LoopsVectorized;
10625 
10626         // Add metadata to disable runtime unrolling a scalar loop when there
10627         // are no runtime checks about strides and memory. A scalar loop that is
10628         // rarely used is not worth unrolling.
10629         if (!LB.areSafetyChecksAdded())
10630           DisableRuntimeUnroll = true;
10631       }
10632       // Report the vectorization decision.
10633       ORE->emit([&]() {
10634         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10635                                   L->getHeader())
10636                << "vectorized loop (vectorization width: "
10637                << NV("VectorizationFactor", VF.Width)
10638                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10639       });
10640     }
10641 
10642     if (ORE->allowExtraAnalysis(LV_NAME))
10643       checkMixedPrecision(L, ORE);
10644   }
10645 
10646   Optional<MDNode *> RemainderLoopID =
10647       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10648                                       LLVMLoopVectorizeFollowupEpilogue});
10649   if (RemainderLoopID.hasValue()) {
10650     L->setLoopID(RemainderLoopID.getValue());
10651   } else {
10652     if (DisableRuntimeUnroll)
10653       AddRuntimeUnrollDisableMetaData(L);
10654 
10655     // Mark the loop as already vectorized to avoid vectorizing again.
10656     Hints.setAlreadyVectorized();
10657   }
10658 
10659   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10660   return true;
10661 }
10662 
10663 LoopVectorizeResult LoopVectorizePass::runImpl(
10664     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10665     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10666     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10667     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10668     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10669   SE = &SE_;
10670   LI = &LI_;
10671   TTI = &TTI_;
10672   DT = &DT_;
10673   BFI = &BFI_;
10674   TLI = TLI_;
10675   AA = &AA_;
10676   AC = &AC_;
10677   GetLAA = &GetLAA_;
10678   DB = &DB_;
10679   ORE = &ORE_;
10680   PSI = PSI_;
10681 
10682   // Don't attempt if
10683   // 1. the target claims to have no vector registers, and
10684   // 2. interleaving won't help ILP.
10685   //
10686   // The second condition is necessary because, even if the target has no
10687   // vector registers, loop vectorization may still enable scalar
10688   // interleaving.
10689   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10690       TTI->getMaxInterleaveFactor(1) < 2)
10691     return LoopVectorizeResult(false, false);
10692 
10693   bool Changed = false, CFGChanged = false;
10694 
10695   // The vectorizer requires loops to be in simplified form.
10696   // Since simplification may add new inner loops, it has to run before the
10697   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
10699   // vectorized.
10700   for (auto &L : *LI)
10701     Changed |= CFGChanged |=
10702         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10703 
10704   // Build up a worklist of inner-loops to vectorize. This is necessary as
10705   // the act of vectorizing or partially unrolling a loop creates new loops
10706   // and can invalidate iterators across the loops.
10707   SmallVector<Loop *, 8> Worklist;
10708 
10709   for (Loop *L : *LI)
10710     collectSupportedLoops(*L, LI, ORE, Worklist);
10711 
10712   LoopsAnalyzed += Worklist.size();
10713 
10714   // Now walk the identified inner loops.
10715   while (!Worklist.empty()) {
10716     Loop *L = Worklist.pop_back_val();
10717 
10718     // For the inner loops we actually process, form LCSSA to simplify the
10719     // transform.
10720     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10721 
10722     Changed |= CFGChanged |= processLoop(L);
10723   }
10724 
10725   // Process each loop nest in the function.
10726   return LoopVectorizeResult(Changed, CFGChanged);
10727 }
10728 
10729 PreservedAnalyses LoopVectorizePass::run(Function &F,
10730                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
10778 }
10779 
10780 void LoopVectorizePass::printPipeline(
10781     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10782   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10783       OS, MapClassName2PassName);
10784 
10785   OS << "<";
10786   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10787   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10788   OS << ">";
10789 }
10790