1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
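//
// For illustration only, with a vectorization factor (VF) of 4 a scalar loop
// such as
//   for (i = 0; i < n; i++)
//     a[i] = b[i] + c[i];
// is conceptually rewritten so that each vector-loop iteration computes
//   a[i:i+3] = b[i:i+3] + c[i:i+3];  // one 'wide' SIMD iteration, i += 4
// with any remaining iterations handled by a scalar epilogue loop.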
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// There is an ongoing development effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/Metadata.h"
116 #include "llvm/IR/Module.h"
117 #include "llvm/IR/Operator.h"
118 #include "llvm/IR/PatternMatch.h"
119 #include "llvm/IR/Type.h"
120 #include "llvm/IR/Use.h"
121 #include "llvm/IR/User.h"
122 #include "llvm/IR/Value.h"
123 #include "llvm/IR/ValueHandle.h"
124 #include "llvm/IR/Verifier.h"
125 #include "llvm/InitializePasses.h"
126 #include "llvm/Pass.h"
127 #include "llvm/Support/Casting.h"
128 #include "llvm/Support/CommandLine.h"
129 #include "llvm/Support/Compiler.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <functional>
147 #include <iterator>
148 #include <limits>
149 #include <map>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 #ifndef NDEBUG
161 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
162 #endif
163 
164 /// @{
165 /// Metadata attribute names
166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
167 const char LLVMLoopVectorizeFollowupVectorized[] =
168     "llvm.loop.vectorize.followup_vectorized";
169 const char LLVMLoopVectorizeFollowupEpilogue[] =
170     "llvm.loop.vectorize.followup_epilogue";
171 /// @}
172 
173 STATISTIC(LoopsVectorized, "Number of loops vectorized");
174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176 
177 static cl::opt<bool> EnableEpilogueVectorization(
178     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
179     cl::desc("Enable vectorization of epilogue loops."));
180 
181 static cl::opt<unsigned> EpilogueVectorizationForceVF(
182     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
183     cl::desc("When epilogue vectorization is enabled, and a value greater than "
184              "1 is specified, forces the given VF for all applicable epilogue "
185              "loops."));
186 
187 static cl::opt<unsigned> EpilogueVectorizationMinVF(
188     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189     cl::desc("Only loops with vectorization factor equal to or larger than "
190              "the specified value are considered for epilogue vectorization."));
191 
192 /// Loops with a known constant trip count below this number are vectorized only
193 /// if no scalar iteration overheads are incurred.
194 static cl::opt<unsigned> TinyTripCountVectorThreshold(
195     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
196     cl::desc("Loops with a constant trip count that is smaller than this "
197              "value are vectorized only if no scalar iteration overheads "
198              "are incurred."));
199 
200 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
201     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
202     cl::desc("The maximum allowed number of runtime memory checks with a "
203              "vectorize(enable) pragma."));
204 
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired and that predication is preferred; it lists the available options.
// That is, the vectorizer will try to fold the tail loop (epilogue) into the
// vector body and predicate the instructions accordingly. If tail-folding
// fails, there are different fallback strategies depending on these values:
210 namespace PreferPredicateTy {
211   enum Option {
212     ScalarEpilogue = 0,
213     PredicateElseScalarEpilogue,
214     PredicateOrDontVectorize
215   };
216 } // namespace PreferPredicateTy
217 
218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219     "prefer-predicate-over-epilogue",
220     cl::init(PreferPredicateTy::ScalarEpilogue),
221     cl::Hidden,
222     cl::desc("Tail-folding and predication preferences over creating a scalar "
223              "epilogue loop."),
224     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
225                          "scalar-epilogue",
226                          "Don't tail-predicate loops, create scalar epilogue"),
227               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
228                          "predicate-else-scalar-epilogue",
229                          "prefer tail-folding, create scalar epilogue if tail "
230                          "folding fails."),
231               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
232                          "predicate-dont-vectorize",
                         "prefer tail-folding, don't attempt vectorization if "
234                          "tail-folding fails.")));
235 
236 static cl::opt<bool> MaximizeBandwidth(
237     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
238     cl::desc("Maximize bandwidth when selecting vectorization factor which "
239              "will be determined by the smallest type in loop."));
240 
241 static cl::opt<bool> EnableInterleavedMemAccesses(
242     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
243     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
244 
245 /// An interleave-group may need masking if it resides in a block that needs
246 /// predication, or in order to mask away gaps.
247 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
248     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses "
             "in a loop"));
250 
251 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
252     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
254              "below this number"));
255 
256 static cl::opt<unsigned> ForceTargetNumScalarRegs(
257     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
258     cl::desc("A flag that overrides the target's number of scalar registers."));
259 
260 static cl::opt<unsigned> ForceTargetNumVectorRegs(
261     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
262     cl::desc("A flag that overrides the target's number of vector registers."));
263 
264 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
265     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
266     cl::desc("A flag that overrides the target's max interleave factor for "
267              "scalar loops."));
268 
269 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
270     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
271     cl::desc("A flag that overrides the target's max interleave factor for "
272              "vectorized loops."));
273 
274 static cl::opt<unsigned> ForceTargetInstructionCost(
275     "force-target-instruction-cost", cl::init(0), cl::Hidden,
276     cl::desc("A flag that overrides the target's expected cost for "
277              "an instruction to a single constant value. Mostly "
278              "useful for getting consistent testing."));
279 
280 static cl::opt<bool> ForceTargetSupportsScalableVectors(
281     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
282     cl::desc(
283         "Pretend that scalable vectors are supported, even if the target does "
284         "not support them. This flag should only be used for testing."));
285 
286 static cl::opt<unsigned> SmallLoopCost(
287     "small-loop-cost", cl::init(20), cl::Hidden,
288     cl::desc(
289         "The cost of a loop that is considered 'small' by the interleaver."));
290 
291 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
292     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
293     cl::desc("Enable the use of the block frequency analysis to access PGO "
294              "heuristics minimizing code growth in cold regions and being more "
295              "aggressive in hot regions."));
296 
297 // Runtime interleave loops for load/store throughput.
298 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
299     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
300     cl::desc(
301         "Enable runtime interleaving until load/store ports are saturated"));
302 
303 /// Interleave small loops with scalar reductions.
304 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
305     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
306     cl::desc("Enable interleaving for loops with small iteration counts that "
307              "contain scalar reductions to expose ILP."));
308 
309 /// The number of stores in a loop that are allowed to need predication.
310 static cl::opt<unsigned> NumberOfStoresToPredicate(
311     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
312     cl::desc("Max number of stores to be predicated behind an if."));
313 
314 static cl::opt<bool> EnableIndVarRegisterHeur(
315     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
316     cl::desc("Count the induction variable only once when interleaving"));
317 
318 static cl::opt<bool> EnableCondStoresVectorization(
319     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
320     cl::desc("Enable if predication of stores during vectorization."));
321 
322 static cl::opt<unsigned> MaxNestedScalarReductionIC(
323     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
324     cl::desc("The maximum interleave count to use when interleaving a scalar "
325              "reduction in a nested loop."));
326 
327 static cl::opt<bool>
328     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
329                            cl::Hidden,
330                            cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));
332 
333 static cl::opt<bool> ForceOrderedReductions(
334     "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
336              "FP reductions"));
337 
338 static cl::opt<bool> PreferPredicatedReductionSelect(
339     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
340     cl::desc(
341         "Prefer predicating a reduction operation over an after loop select."));
342 
343 cl::opt<bool> EnableVPlanNativePath(
344     "enable-vplan-native-path", cl::init(false), cl::Hidden,
345     cl::desc("Enable VPlan-native vectorization path with "
346              "support for outer loop vectorization."));
347 
348 // FIXME: Remove this switch once we have divergence analysis. Currently we
349 // assume divergent non-backedge branches when this switch is true.
350 cl::opt<bool> EnableVPlanPredication(
351     "enable-vplan-predication", cl::init(false), cl::Hidden,
352     cl::desc("Enable VPlan-native vectorization path predicator with "
353              "support for outer loop vectorization."));
354 
355 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
357 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
358 // verification of the H-CFGs built.
359 static cl::opt<bool> VPlanBuildStressTest(
360     "vplan-build-stress-test", cl::init(false), cl::Hidden,
361     cl::desc(
362         "Build VPlan for every supported loop nest in the function and bail "
363         "out right after the build (stress test the VPlan H-CFG construction "
364         "in the VPlan-native vectorization path)."));
365 
366 cl::opt<bool> llvm::EnableLoopInterleaving(
367     "interleave-loops", cl::init(true), cl::Hidden,
368     cl::desc("Enable loop interleaving in Loop vectorization passes"));
369 cl::opt<bool> llvm::EnableLoopVectorization(
370     "vectorize-loops", cl::init(true), cl::Hidden,
371     cl::desc("Run the Loop vectorization passes"));
372 
373 cl::opt<bool> PrintVPlansInDotFormat(
374     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
375     cl::desc("Use dot format instead of plain text when dumping VPlans"));
376 
377 /// A helper function that returns true if the given type is irregular. The
378 /// type is irregular if its allocated size doesn't equal the store size of an
379 /// element of the corresponding vector type.
380 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
381   // Determine if an array of N elements of type Ty is "bitcast compatible"
382   // with a <N x Ty> vector.
383   // This is only true if there is no padding between the array elements.
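  // For example, on typical targets x86_fp80 has a type size of 80 bits but a
  // larger allocation size (e.g. 96 or 128 bits), so an array of x86_fp80
  // contains padding between elements and the type is considered irregular.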
384   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
385 }
386 
387 /// A helper function that returns the reciprocal of the block probability of
388 /// predicated blocks. If we return X, we are assuming the predicated block
389 /// will execute once for every X iterations of the loop header.
390 ///
391 /// TODO: We should use actual block probability here, if available. Currently,
392 ///       we always assume predicated blocks have a 50% chance of executing.
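/// For example, with the current return value of 2, the cost of a predicated
/// block is scaled down by a factor of 2 when it is accumulated into the
/// expected cost of a single loop iteration.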
393 static unsigned getReciprocalPredBlockProb() { return 2; }
394 
395 /// A helper function that returns an integer or floating-point constant with
396 /// value C.
397 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
398   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
399                            : ConstantFP::get(Ty, C);
400 }
401 
402 /// Returns "best known" trip count for the specified loop \p L as defined by
403 /// the following procedure:
404 ///   1) Returns exact trip count if it is known.
405 ///   2) Returns expected trip count according to profile data if any.
406 ///   3) Returns upper bound estimate if it is known.
407 ///   4) Returns None if all of the above failed.
408 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
409   // Check if exact trip count is known.
410   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
411     return ExpectedTC;
412 
413   // Check if there is an expected trip count available from profile data.
414   if (LoopVectorizeWithBlockFrequency)
415     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
416       return EstimatedTC;
417 
418   // Check if upper bound estimate is known.
419   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
420     return ExpectedTC;
421 
422   return None;
423 }
424 
425 // Forward declare GeneratedRTChecks.
426 class GeneratedRTChecks;
427 
428 namespace llvm {
429 
430 AnalysisKey ShouldRunExtraVectorPasses::Key;
431 
432 /// InnerLoopVectorizer vectorizes loops which contain only one basic
433 /// block to a specified vectorization factor (VF).
434 /// This class performs the widening of scalars into vectors, or multiple
435 /// scalars. This class also implements the following features:
436 /// * It inserts an epilogue loop for handling loops that don't have iteration
437 ///   counts that are known to be a multiple of the vectorization factor.
438 /// * It handles the code generation for reduction variables.
439 /// * Scalarization (implementation using scalars) of un-vectorizable
440 ///   instructions.
441 /// InnerLoopVectorizer does not perform any vectorization-legality
442 /// checks, and relies on the caller to check for the different legality
443 /// aspects. The InnerLoopVectorizer relies on the
444 /// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found in the loop.
446 class InnerLoopVectorizer {
447 public:
448   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
449                       LoopInfo *LI, DominatorTree *DT,
450                       const TargetLibraryInfo *TLI,
451                       const TargetTransformInfo *TTI, AssumptionCache *AC,
452                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
453                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
454                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
455                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
456       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
457         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
458         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
459         PSI(PSI), RTChecks(RTChecks) {
460     // Query this against the original loop and save it here because the profile
461     // of the original loop header may change as the transformation happens.
462     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
463         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
464   }
465 
466   virtual ~InnerLoopVectorizer() = default;
467 
468   /// Create a new empty loop that will contain vectorized instructions later
469   /// on, while the old loop will be used as the scalar remainder. Control flow
470   /// is generated around the vectorized (and scalar epilogue) loops consisting
471   /// of various checks and bypasses. Return the pre-header block of the new
472   /// loop and the start value for the canonical induction, if it is != 0. The
473   /// latter is the case when vectorizing the epilogue loop. In the case of
/// epilogue vectorization, this function is overridden to handle the more
475   /// complex control flow around the loops.
476   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
477 
478   /// Widen a single call instruction within the innermost loop.
479   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
480                             VPTransformState &State);
481 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
483   void fixVectorizedLoop(VPTransformState &State);
484 
485   // Return true if any runtime check is added.
486   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
487 
488   /// A type for vectorized values in the new loop. Each value from the
489   /// original loop, when vectorized, is represented by UF vector values in the
490   /// new unrolled loop, where UF is the unroll factor.
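  /// For example, with UF = 2 and VF = 4, an i32 value from the original loop
  /// is represented by two <4 x i32> vector values in the vectorized loop.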
491   using VectorParts = SmallVector<Value *, 2>;
492 
493   /// Vectorize a single first-order recurrence or pointer induction PHINode in
494   /// a block. This method handles the induction variable canonicalization. It
495   /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
496   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
497                            VPTransformState &State);
498 
  /// A helper function to scalarize a single Instruction in the innermost
  /// loop. Generates a scalar instance for the lane and part specified by
  /// \p Instance. Uses the VPValue operands from \p RepRecipe instead of
  /// \p Instr's operands.
504   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
505                             const VPIteration &Instance, bool IfPredicateInstr,
506                             VPTransformState &State);
507 
508   /// Construct the vector value of a scalarized value \p V one lane at a time.
509   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
510                                  VPTransformState &State);
511 
512   /// Try to vectorize interleaved access group \p Group with the base address
513   /// given in \p Addr, optionally masking the vector operations if \p
514   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
515   /// values in the vectorized loop.
516   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
517                                 ArrayRef<VPValue *> VPDefs,
518                                 VPTransformState &State, VPValue *Addr,
519                                 ArrayRef<VPValue *> StoredValues,
520                                 VPValue *BlockInMask = nullptr);
521 
  /// Set the debug location in the builder \p CustomBuilder using the debug
  /// location in \p V. If \p CustomBuilder is None, the class member Builder
  /// is used.
524   void setDebugLocFromInst(const Value *V,
525                            Optional<IRBuilderBase *> CustomBuilder = None);
526 
527   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
528   void fixNonInductionPHIs(VPTransformState &State);
529 
530   /// Returns true if the reordering of FP operations is not allowed, but we are
531   /// able to vectorize with strict in-order reductions for the given RdxDesc.
532   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
533 
534   /// Create a broadcast instruction. This method generates a broadcast
535   /// instruction (shuffle) for loop invariant values and for the induction
536   /// value. If this is the induction variable then we extend it to N, N+1, ...
537   /// this is needed because each iteration in the loop corresponds to a SIMD
538   /// element.
539   virtual Value *getBroadcastInstrs(Value *V);
540 
541   /// Add metadata from one instruction to another.
542   ///
543   /// This includes both the original MDs from \p From and additional ones (\see
544   /// addNewMetadata).  Use this for *newly created* instructions in the vector
545   /// loop.
546   void addMetadata(Instruction *To, Instruction *From);
547 
548   /// Similar to the previous function but it adds the metadata to a
549   /// vector of instructions.
550   void addMetadata(ArrayRef<Value *> To, Instruction *From);
551 
552   // Returns the resume value (bc.merge.rdx) for a reduction as
553   // generated by fixReduction.
554   PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
555 
556 protected:
557   friend class LoopVectorizationPlanner;
558 
559   /// A small list of PHINodes.
560   using PhiVector = SmallVector<PHINode *, 4>;
561 
562   /// A type for scalarized values in the new loop. Each value from the
563   /// original loop, when scalarized, is represented by UF x VF scalar values
564   /// in the new unrolled loop, where UF is the unroll factor and VF is the
565   /// vectorization factor.
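  /// For example, with UF = 2 and VF = 4, a scalarized value is represented
  /// by 8 scalar copies: 2 parts, each holding the 4 per-lane values.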
566   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
567 
568   /// Set up the values of the IVs correctly when exiting the vector loop.
569   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
570                     Value *CountRoundDown, Value *EndValue,
571                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader);
572 
573   /// Introduce a conditional branch (on true, condition to be set later) at the
574   /// end of the header=latch connecting it to itself (across the backedge) and
575   /// to the exit block of \p L.
576   void createHeaderBranch(Loop *L);
577 
578   /// Handle all cross-iteration phis in the header.
579   void fixCrossIterationPHIs(VPTransformState &State);
580 
581   /// Create the exit value of first order recurrences in the middle block and
582   /// update their users.
583   void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
584                                VPTransformState &State);
585 
586   /// Create code for the loop exit value of the reduction.
587   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
588 
589   /// Clear NSW/NUW flags from reduction instructions if necessary.
590   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
591                                VPTransformState &State);
592 
593   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
594   /// means we need to add the appropriate incoming value from the middle
595   /// block as exiting edges from the scalar epilogue loop (if present) are
596   /// already in place, and we exit the vector loop exclusively to the middle
597   /// block.
598   void fixLCSSAPHIs(VPTransformState &State);
599 
600   /// Iteratively sink the scalarized operands of a predicated instruction into
601   /// the block that was created for it.
602   void sinkScalarOperands(Instruction *PredInst);
603 
604   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
605   /// represented as.
606   void truncateToMinimalBitwidths(VPTransformState &State);
607 
608   /// Returns (and creates if needed) the original loop trip count.
609   Value *getOrCreateTripCount(BasicBlock *InsertBlock);
610 
611   /// Returns (and creates if needed) the trip count of the widened loop.
612   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
613 
614   /// Returns a bitcasted value to the requested vector type.
615   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
616   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
617                                 const DataLayout &DL);
618 
619   /// Emit a bypass check to see if the vector trip count is zero, including if
620   /// it overflows.
621   void emitMinimumIterationCountCheck(BasicBlock *Bypass);
622 
623   /// Emit a bypass check to see if all of the SCEV assumptions we've
624   /// had to make are correct. Returns the block containing the checks or
625   /// nullptr if no checks have been added.
626   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
627 
628   /// Emit bypass checks to check any memory assumptions we may have made.
629   /// Returns the block containing the checks or nullptr if no checks have been
630   /// added.
631   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
632 
633   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
634   /// vector loop preheader, middle block and scalar preheader. Also
635   /// allocate a loop object for the new vector loop and return it.
636   Loop *createVectorLoopSkeleton(StringRef Prefix);
637 
  /// Create new phi nodes for the induction variables so that the scalar
  /// epilogue resumes the iteration count from where the vectorized loop left
  /// off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
641   /// vectorization) and the resume values can come from an additional bypass
642   /// block, the \p AdditionalBypass pair provides information about the bypass
643   /// block and the end value on the edge from bypass to this loop.
644   void createInductionResumeValues(
645       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
646 
647   /// Complete the loop skeleton by adding debug MDs, creating appropriate
648   /// conditional branches in the middle block, preparing the builder and
649   /// running the verifier. Return the preheader of the completed vector loop.
650   BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);
651 
652   /// Add additional metadata to \p To that was not present on \p Orig.
653   ///
654   /// Currently this is used to add the noalias annotations based on the
655   /// inserted memchecks.  Use this for instructions that are *cloned* into the
656   /// vector loop.
657   void addNewMetadata(Instruction *To, const Instruction *Orig);
658 
659   /// Collect poison-generating recipes that may generate a poison value that is
660   /// used after vectorization, even when their operands are not poison. Those
661   /// recipes meet the following conditions:
662   ///  * Contribute to the address computation of a recipe generating a widen
663   ///    memory load/store (VPWidenMemoryInstructionRecipe or
664   ///    VPInterleaveRecipe).
665   ///  * Such a widen memory load/store has at least one underlying Instruction
666   ///    that is in a basic block that needs predication and after vectorization
667   ///    the generated instruction won't be predicated.
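  /// An illustrative case: an inbounds getelementptr that feeds the address
  /// of a load in a conditionally executed block may produce poison for the
  /// masked-off lanes once the load is widened and no longer predicated; such
  /// a recipe is collected here so that its poison-generating flags can be
  /// dropped during code generation.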
668   void collectPoisonGeneratingRecipes(VPTransformState &State);
669 
670   /// Allow subclasses to override and print debug traces before/after vplan
671   /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}
674 
675   /// The original loop.
676   Loop *OrigLoop;
677 
678   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
679   /// dynamic knowledge to simplify SCEV expressions and converts them to a
680   /// more usable form.
681   PredicatedScalarEvolution &PSE;
682 
683   /// Loop Info.
684   LoopInfo *LI;
685 
686   /// Dominator Tree.
687   DominatorTree *DT;
688 
689   /// Alias Analysis.
690   AAResults *AA;
691 
692   /// Target Library Info.
693   const TargetLibraryInfo *TLI;
694 
695   /// Target Transform Info.
696   const TargetTransformInfo *TTI;
697 
698   /// Assumption Cache.
699   AssumptionCache *AC;
700 
701   /// Interface to emit optimization remarks.
702   OptimizationRemarkEmitter *ORE;
703 
704   /// LoopVersioning.  It's only set up (non-null) if memchecks were
705   /// used.
706   ///
707   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
709   std::unique_ptr<LoopVersioning> LVer;
710 
711   /// The vectorization SIMD factor to use. Each vector will have this many
712   /// vector elements.
713   ElementCount VF;
714 
715   /// The vectorization unroll factor to use. Each scalar is vectorized to this
716   /// many different vector instructions.
717   unsigned UF;
718 
719   /// The builder that we use
720   IRBuilder<> Builder;
721 
722   // --- Vectorization state ---
723 
724   /// The vector-loop preheader.
725   BasicBlock *LoopVectorPreHeader;
726 
727   /// The scalar-loop preheader.
728   BasicBlock *LoopScalarPreHeader;
729 
730   /// Middle Block between the vector and the scalar.
731   BasicBlock *LoopMiddleBlock;
732 
733   /// The unique ExitBlock of the scalar loop if one exists.  Note that
734   /// there can be multiple exiting edges reaching this block.
735   BasicBlock *LoopExitBlock;
736 
737   /// The scalar loop body.
738   BasicBlock *LoopScalarBody;
739 
740   /// A list of all bypass blocks. The first block is the entry of the loop.
741   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
742 
743   /// Store instructions that were predicated.
744   SmallVector<Instruction *, 4> PredicatedInstructions;
745 
746   /// Trip count of the original loop.
747   Value *TripCount = nullptr;
748 
749   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
750   Value *VectorTripCount = nullptr;
751 
752   /// The legality analysis.
753   LoopVectorizationLegality *Legal;
754 
  /// The profitability analysis.
756   LoopVectorizationCostModel *Cost;
757 
758   // Record whether runtime checks are added.
759   bool AddedSafetyChecks = false;
760 
761   // Holds the end values for each induction variable. We save the end values
  // so we can later fix up the external users of the induction variables.
763   DenseMap<PHINode *, Value *> IVEndValues;
764 
765   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
766   // fixed up at the end of vector code generation.
767   SmallVector<PHINode *, 8> OrigPHIsToFix;
768 
769   /// BFI and PSI are used to check for profile guided size optimizations.
770   BlockFrequencyInfo *BFI;
771   ProfileSummaryInfo *PSI;
772 
  // Whether this loop should be optimized for size based on profile-guided
  // size optimizations.
775   bool OptForSizeBasedOnProfile;
776 
  /// Structure to hold information about generated runtime checks,
  /// responsible for cleaning up the checks if vectorization turns out to be
  /// unprofitable.
779   GeneratedRTChecks &RTChecks;
780 
781   // Holds the resume values for reductions in the loops, used to set the
782   // correct start value of reduction PHIs when vectorizing the epilogue.
783   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
784       ReductionResumeValues;
785 };
786 
787 class InnerLoopUnroller : public InnerLoopVectorizer {
788 public:
789   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
790                     LoopInfo *LI, DominatorTree *DT,
791                     const TargetLibraryInfo *TLI,
792                     const TargetTransformInfo *TTI, AssumptionCache *AC,
793                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
794                     LoopVectorizationLegality *LVL,
795                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
796                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
797       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
798                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
799                             BFI, PSI, Check) {}
800 
801 private:
802   Value *getBroadcastInstrs(Value *V) override;
803 };
804 
805 /// Encapsulate information regarding vectorization of a loop and its epilogue.
806 /// This information is meant to be updated and used across two stages of
807 /// epilogue vectorization.
808 struct EpilogueLoopVectorizationInfo {
809   ElementCount MainLoopVF = ElementCount::getFixed(0);
810   unsigned MainLoopUF = 0;
811   ElementCount EpilogueVF = ElementCount::getFixed(0);
812   unsigned EpilogueUF = 0;
813   BasicBlock *MainLoopIterationCountCheck = nullptr;
814   BasicBlock *EpilogueIterationCountCheck = nullptr;
815   BasicBlock *SCEVSafetyCheck = nullptr;
816   BasicBlock *MemSafetyCheck = nullptr;
817   Value *TripCount = nullptr;
818   Value *VectorTripCount = nullptr;
819 
820   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
821                                 ElementCount EVF, unsigned EUF)
822       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
823     assert(EUF == 1 &&
824            "A high UF for the epilogue loop is likely not beneficial.");
825   }
826 };
827 
828 /// An extension of the inner loop vectorizer that creates a skeleton for a
829 /// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice: first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue.  This is achieved by
833 /// deriving two concrete strategy classes from this base class and invoking
834 /// them in succession from the loop vectorizer planner.
835 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
836 public:
837   InnerLoopAndEpilogueVectorizer(
838       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
839       DominatorTree *DT, const TargetLibraryInfo *TLI,
840       const TargetTransformInfo *TTI, AssumptionCache *AC,
841       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
842       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
843       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
844       GeneratedRTChecks &Checks)
845       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
846                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
847                             Checks),
848         EPI(EPI) {}
849 
850   // Override this function to handle the more complex control flow around the
851   // three loops.
852   std::pair<BasicBlock *, Value *>
853   createVectorizedLoopSkeleton() final override {
854     return createEpilogueVectorizedLoopSkeleton();
855   }
856 
857   /// The interface for creating a vectorized skeleton using one of two
858   /// different strategies, each corresponding to one execution of the vplan
859   /// as described above.
860   virtual std::pair<BasicBlock *, Value *>
861   createEpilogueVectorizedLoopSkeleton() = 0;
862 
863   /// Holds and updates state information required to vectorize the main loop
864   /// and its epilogue in two separate passes. This setup helps us avoid
865   /// regenerating and recomputing runtime safety checks. It also helps us to
866   /// shorten the iteration-count-check path length for the cases where the
867   /// iteration count of the loop is so small that the main vector loop is
868   /// completely skipped.
869   EpilogueLoopVectorizationInfo &EPI;
870 };
871 
872 /// A specialized derived class of inner loop vectorizer that performs
873 /// vectorization of *main* loops in the process of vectorizing loops and their
874 /// epilogues.
875 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
876 public:
877   EpilogueVectorizerMainLoop(
878       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
879       DominatorTree *DT, const TargetLibraryInfo *TLI,
880       const TargetTransformInfo *TTI, AssumptionCache *AC,
881       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
882       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
883       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
884       GeneratedRTChecks &Check)
885       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
886                                        EPI, LVL, CM, BFI, PSI, Check) {}
887   /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
889   std::pair<BasicBlock *, Value *>
890   createEpilogueVectorizedLoopSkeleton() final override;
891 
892 protected:
893   /// Emits an iteration count bypass check once for the main loop (when \p
894   /// ForEpilogue is false) and once for the epilogue loop (when \p
895   /// ForEpilogue is true).
896   BasicBlock *emitMinimumIterationCountCheck(BasicBlock *Bypass,
897                                              bool ForEpilogue);
898   void printDebugTracesAtStart() override;
899   void printDebugTracesAtEnd() override;
900 };
901 
902 // A specialized derived class of inner loop vectorizer that performs
903 // vectorization of *epilogue* loops in the process of vectorizing loops and
904 // their epilogues.
905 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
906 public:
907   EpilogueVectorizerEpilogueLoop(
908       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
909       DominatorTree *DT, const TargetLibraryInfo *TLI,
910       const TargetTransformInfo *TTI, AssumptionCache *AC,
911       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
912       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
913       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
914       GeneratedRTChecks &Checks)
915       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
916                                        EPI, LVL, CM, BFI, PSI, Checks) {}
917   /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
919   std::pair<BasicBlock *, Value *>
920   createEpilogueVectorizedLoopSkeleton() final override;
921 
922 protected:
923   /// Emits an iteration count bypass check after the main vector loop has
924   /// finished to see if there are any iterations left to execute by either
925   /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
929   void printDebugTracesAtStart() override;
930   void printDebugTracesAtEnd() override;
931 };
932 } // end namespace llvm
933 
/// Look for a meaningful debug location on the instruction or its
935 /// operands.
936 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
937   if (!I)
938     return I;
939 
940   DebugLoc Empty;
941   if (I->getDebugLoc() != Empty)
942     return I;
943 
944   for (Use &Op : I->operands()) {
945     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
946       if (OpInst->getDebugLoc() != Empty)
947         return OpInst;
948   }
949 
950   return I;
951 }
952 
953 void InnerLoopVectorizer::setDebugLocFromInst(
954     const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
955   IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
956   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
957     const DILocation *DIL = Inst->getDebugLoc();
958 
    // When an FSDiscriminator is enabled, we don't need to add the multiply
960     // factors to the discriminators.
961     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
962         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
963       // FIXME: For scalable vectors, assume vscale=1.
964       auto NewDIL =
965           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
966       if (NewDIL)
967         B->SetCurrentDebugLocation(NewDIL.getValue());
968       else
969         LLVM_DEBUG(dbgs()
970                    << "Failed to create new discriminator: "
971                    << DIL->getFilename() << " Line: " << DIL->getLine());
972     } else
973       B->SetCurrentDebugLocation(DIL);
974   } else
975     B->SetCurrentDebugLocation(DebugLoc());
976 }
977 
978 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
979 /// is passed, the message relates to that particular instruction.
980 #ifndef NDEBUG
981 static void debugVectorizationMessage(const StringRef Prefix,
982                                       const StringRef DebugMsg,
983                                       Instruction *I) {
984   dbgs() << "LV: " << Prefix << DebugMsg;
985   if (I != nullptr)
986     dbgs() << " " << *I;
987   else
988     dbgs() << '.';
989   dbgs() << '\n';
990 }
991 #endif
992 
993 /// Create an analysis remark that explains why vectorization failed
994 ///
995 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
996 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
997 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
998 /// the location of the remark.  \return the remark object that can be
999 /// streamed to.
1000 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1001     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1002   Value *CodeRegion = TheLoop->getHeader();
1003   DebugLoc DL = TheLoop->getStartLoc();
1004 
1005   if (I) {
1006     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
1008     // using the loop's.
1009     if (I->getDebugLoc())
1010       DL = I->getDebugLoc();
1011   }
1012 
1013   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1014 }
1015 
1016 namespace llvm {
1017 
1018 /// Return a value for Step multiplied by VF.
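/// For example, for Step = 2 and a fixed VF of 4 this returns the constant 8;
/// for a scalable VF of <vscale x 4> it returns the runtime value 8 * vscale.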
1019 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
1020                        int64_t Step) {
1021   assert(Ty->isIntegerTy() && "Expected an integer step");
1022   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
1023   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1024 }
1025 
1026 /// Return the runtime value for VF.
1027 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
1028   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1029   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1030 }
1031 
1032 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
1033                                   ElementCount VF) {
1034   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1035   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1036   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1037   return B.CreateUIToFP(RuntimeVF, FTy);
1038 }
1039 
1040 void reportVectorizationFailure(const StringRef DebugMsg,
1041                                 const StringRef OREMsg, const StringRef ORETag,
1042                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1043                                 Instruction *I) {
1044   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1045   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1046   ORE->emit(
1047       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1048       << "loop not vectorized: " << OREMsg);
1049 }
1050 
1051 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1052                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1053                              Instruction *I) {
1054   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1055   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1056   ORE->emit(
1057       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1058       << Msg);
1059 }
1060 
1061 } // end namespace llvm
1062 
1063 #ifndef NDEBUG
1064 /// \return string containing a file name and a line # for the given loop.
1065 static std::string getDebugLocString(const Loop *L) {
1066   std::string Result;
1067   if (L) {
1068     raw_string_ostream OS(Result);
1069     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1070       LoopDbgLoc.print(OS);
1071     else
1072       // Just print the module name.
1073       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1074     OS.flush();
1075   }
1076   return Result;
1077 }
1078 #endif
1079 
1080 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1081                                          const Instruction *Orig) {
1082   // If the loop was versioned with memchecks, add the corresponding no-alias
1083   // metadata.
1084   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1085     LVer->annotateInstWithNoAlias(To, Orig);
1086 }
1087 
1088 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1089     VPTransformState &State) {
1090 
1091   // Collect recipes in the backward slice of `Root` that may generate a poison
1092   // value that is used after vectorization.
1093   SmallPtrSet<VPRecipeBase *, 16> Visited;
1094   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1095     SmallVector<VPRecipeBase *, 16> Worklist;
1096     Worklist.push_back(Root);
1097 
1098     // Traverse the backward slice of Root through its use-def chain.
1099     while (!Worklist.empty()) {
1100       VPRecipeBase *CurRec = Worklist.back();
1101       Worklist.pop_back();
1102 
1103       if (!Visited.insert(CurRec).second)
1104         continue;
1105 
1106       // Prune search if we find another recipe generating a widen memory
1107       // instruction. Widen memory instructions involved in address computation
1108       // will lead to gather/scatter instructions, which don't need to be
1109       // handled.
1110       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1111           isa<VPInterleaveRecipe>(CurRec) ||
1112           isa<VPScalarIVStepsRecipe>(CurRec) ||
1113           isa<VPCanonicalIVPHIRecipe>(CurRec))
1114         continue;
1115 
1116       // This recipe contributes to the address computation of a widen
1117       // load/store. Collect recipe if its underlying instruction has
1118       // poison-generating flags.
1119       Instruction *Instr = CurRec->getUnderlyingInstr();
1120       if (Instr && Instr->hasPoisonGeneratingFlags())
1121         State.MayGeneratePoisonRecipes.insert(CurRec);
1122 
1123       // Add new definitions to the worklist.
1124       for (VPValue *operand : CurRec->operands())
1125         if (VPDef *OpDef = operand->getDef())
1126           Worklist.push_back(cast<VPRecipeBase>(OpDef));
1127     }
1128   });
1129 
1130   // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1133   auto Iter = depth_first(
1134       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1135   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1136     for (VPRecipeBase &Recipe : *VPBB) {
1137       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1138         Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
1139         VPDef *AddrDef = WidenRec->getAddr()->getDef();
1140         if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
1141             Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
1142           collectPoisonGeneratingInstrsInBackwardSlice(
1143               cast<VPRecipeBase>(AddrDef));
1144       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1145         VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1146         if (AddrDef) {
1147           // Check if any member of the interleave group needs predication.
1148           const InterleaveGroup<Instruction> *InterGroup =
1149               InterleaveRec->getInterleaveGroup();
1150           bool NeedPredication = false;
1151           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1152                I < NumMembers; ++I) {
1153             Instruction *Member = InterGroup->getMember(I);
1154             if (Member)
1155               NeedPredication |=
1156                   Legal->blockNeedsPredication(Member->getParent());
1157           }
1158 
1159           if (NeedPredication)
1160             collectPoisonGeneratingInstrsInBackwardSlice(
1161                 cast<VPRecipeBase>(AddrDef));
1162         }
1163       }
1164     }
1165   }
1166 }
1167 
1168 void InnerLoopVectorizer::addMetadata(Instruction *To,
1169                                       Instruction *From) {
1170   propagateMetadata(To, From);
1171   addNewMetadata(To, From);
1172 }
1173 
1174 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1175                                       Instruction *From) {
1176   for (Value *V : To) {
1177     if (Instruction *I = dyn_cast<Instruction>(V))
1178       addMetadata(I, From);
1179   }
1180 }
1181 
1182 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1183     const RecurrenceDescriptor &RdxDesc) {
1184   auto It = ReductionResumeValues.find(&RdxDesc);
1185   assert(It != ReductionResumeValues.end() &&
1186          "Expected to find a resume value for the reduction.");
1187   return It->second;
1188 }
1189 
1190 namespace llvm {
1191 
// Hints for the loop vectorization cost model about how the scalar epilogue
// loop should be lowered.
1194 enum ScalarEpilogueLowering {
1195 
1196   // The default: allowing scalar epilogues.
1197   CM_ScalarEpilogueAllowed,
1198 
1199   // Vectorization with OptForSize: don't allow epilogues.
1200   CM_ScalarEpilogueNotAllowedOptSize,
1201 
  // A special case of vectorization with OptForSize: loops with a very small
1203   // trip count are considered for vectorization under OptForSize, thereby
1204   // making sure the cost of their loop body is dominant, free of runtime
1205   // guards and scalar iteration overheads.
1206   CM_ScalarEpilogueNotAllowedLowTripLoop,
1207 
1208   // Loop hint predicate indicating an epilogue is undesired.
1209   CM_ScalarEpilogueNotNeededUsePredicate,
1210 
  // Directive indicating we must either tail fold or not vectorize.
1212   CM_ScalarEpilogueNotAllowedUsePredicate
1213 };
1214 
1215 /// ElementCountComparator creates a total ordering for ElementCount
1216 /// for the purposes of using it in a set structure.
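/// Fixed-width counts order before scalable ones, and counts of the same kind
/// order by their known minimum number of elements; for example,
/// 2 < 8 < vscale x 2 < vscale x 4.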
1217 struct ElementCountComparator {
1218   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1219     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1220            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1221   }
1222 };
1223 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1224 
1225 /// LoopVectorizationCostModel - estimates the expected speedups due to
1226 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
1229 /// expected speedup/slowdowns due to the supported instruction set. We use the
1230 /// TargetTransformInfo to query the different backends for the cost of
1231 /// different operations.
1232 class LoopVectorizationCostModel {
1233 public:
1234   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1235                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1236                              LoopVectorizationLegality *Legal,
1237                              const TargetTransformInfo &TTI,
1238                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1239                              AssumptionCache *AC,
1240                              OptimizationRemarkEmitter *ORE, const Function *F,
1241                              const LoopVectorizeHints *Hints,
1242                              InterleavedAccessInfo &IAI)
1243       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1244         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1245         Hints(Hints), InterleaveInfo(IAI) {}
1246 
1247   /// \return An upper bound for the vectorization factors (both fixed and
1248   /// scalable). If the factors are 0, vectorization and interleaving should be
1249   /// avoided up front.
1250   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1251 
1252   /// \return True if runtime checks are required for vectorization, and false
1253   /// otherwise.
1254   bool runtimeChecksRequired();
1255 
1256   /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not zero,
  /// that vectorization factor will be selected if vectorization is possible.
1260   VectorizationFactor
1261   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1262 
1263   VectorizationFactor
1264   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1265                                     const LoopVectorizationPlanner &LVP);
1266 
1267   /// Setup cost-based decisions for user vectorization factor.
1268   /// \return true if the UserVF is a feasible VF to be chosen.
1269   bool selectUserVectorizationFactor(ElementCount UserVF) {
1270     collectUniformsAndScalars(UserVF);
1271     collectInstsToScalarize(UserVF);
1272     return expectedCost(UserVF).first.isValid();
1273   }
1274 
1275   /// \return The size (in bits) of the smallest and widest types in the code
1276   /// that needs to be vectorized. We ignore values that remain scalar such as
1277   /// 64 bit loop indices.
1278   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1279 
1280   /// \return The desired interleave count.
1281   /// If interleave count has been specified by metadata it will be returned.
1282   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1283   /// are the selected vectorization factor and the cost of the selected VF.
1284   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1285 
  /// A memory access instruction may be vectorized in more than one way. The
  /// form of the instruction after vectorization depends on its cost. This
  /// function makes cost-based decisions for Load/Store instructions and
  /// collects them in a map. This decision map is used for building the lists
  /// of loop-uniform and loop-scalar instructions. The calculated cost is
  /// saved with the widening decision in order to avoid redundant
  /// calculations.
1293   void setCostBasedWideningDecision(ElementCount VF);
1294 
1295   /// A struct that represents some properties of the register usage
1296   /// of a loop.
1297   struct RegisterUsage {
1298     /// Holds the number of loop invariant values that are used in the loop.
1299     /// The key is ClassID of target-provided register class.
1300     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1301     /// Holds the maximum number of concurrent live intervals in the loop.
1302     /// The key is ClassID of target-provided register class.
1303     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1304   };
1305 
  /// \return Information about the register usage of the loop for the given
  /// vectorization factors.
1308   SmallVector<RegisterUsage, 8>
1309   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1310 
1311   /// Collect values we want to ignore in the cost model.
1312   void collectValuesToIgnore();
1313 
1314   /// Collect all element types in the loop for which widening is needed.
1315   void collectElementTypesForWidening();
1316 
  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
1319   void collectInLoopReductions();
1320 
1321   /// Returns true if we should use strict in-order reductions for the given
1322   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1323   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1324   /// of FP operations.
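  /// For example, a floating-point accumulation such as
  ///   for (int i = 0; i < n; ++i) Sum += A[i];
  /// must preserve the original operation order when FP reassociation is not
  /// allowed, so it is vectorized with an ordered reduction instead of
  /// reassociated partial sums.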
1325   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1326     return !Hints->allowReordering() && RdxDesc.isOrdered();
1327   }
1328 
1329   /// \returns The smallest bitwidth each instruction can be represented with.
1330   /// The vector equivalents of these instructions should be truncated to this
1331   /// type.
1332   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1333     return MinBWs;
1334   }
1335 
1336   /// \returns True if it is more profitable to scalarize instruction \p I for
1337   /// vectorization factor \p VF.
1338   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1339     assert(VF.isVector() &&
1340            "Profitable to scalarize relevant only for VF > 1.");
1341 
1342     // Cost model is not run in the VPlan-native path - return conservative
1343     // result until this changes.
1344     if (EnableVPlanNativePath)
1345       return false;
1346 
1347     auto Scalars = InstsToScalarize.find(VF);
1348     assert(Scalars != InstsToScalarize.end() &&
1349            "VF not yet analyzed for scalarization profitability");
1350     return Scalars->second.find(I) != Scalars->second.end();
1351   }
1352 
1353   /// Returns true if \p I is known to be uniform after vectorization.
1354   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1355     if (VF.isScalar())
1356       return true;
1357 
1358     // Cost model is not run in the VPlan-native path - return conservative
1359     // result until this changes.
1360     if (EnableVPlanNativePath)
1361       return false;
1362 
1363     auto UniformsPerVF = Uniforms.find(VF);
1364     assert(UniformsPerVF != Uniforms.end() &&
1365            "VF not yet analyzed for uniformity");
1366     return UniformsPerVF->second.count(I);
1367   }
1368 
1369   /// Returns true if \p I is known to be scalar after vectorization.
1370   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1371     if (VF.isScalar())
1372       return true;
1373 
1374     // Cost model is not run in the VPlan-native path - return conservative
1375     // result until this changes.
1376     if (EnableVPlanNativePath)
1377       return false;
1378 
1379     auto ScalarsPerVF = Scalars.find(VF);
1380     assert(ScalarsPerVF != Scalars.end() &&
1381            "Scalar values are not calculated for VF");
1382     return ScalarsPerVF->second.count(I);
1383   }
1384 
1385   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1386   /// for vectorization factor \p VF.
1387   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1388     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1389            !isProfitableToScalarize(I, VF) &&
1390            !isScalarAfterVectorization(I, VF);
1391   }
1392 
1393   /// Decision that was taken during cost calculation for memory instruction.
1394   enum InstWidening {
1395     CM_Unknown,
1396     CM_Widen,         // For consecutive accesses with stride +1.
1397     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1398     CM_Interleave,
1399     CM_GatherScatter,
1400     CM_Scalarize
1401   };
1402 
1403   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1404   /// instruction \p I and vector width \p VF.
1405   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1406                            InstructionCost Cost) {
1407     assert(VF.isVector() && "Expected VF >=2");
1408     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1409   }
1410 
1411   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1412   /// interleaving group \p Grp and vector width \p VF.
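  /// For example, for a group {A, B} whose insert position is A, A is mapped
  /// to (W, Cost) and B to (W, 0), so the cost of the group is accounted for
  /// only once.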
1413   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1414                            ElementCount VF, InstWidening W,
1415                            InstructionCost Cost) {
1416     assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
1418     /// But the cost will be assigned to one instruction only.
1419     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1420       if (auto *I = Grp->getMember(i)) {
1421         if (Grp->getInsertPos() == I)
1422           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1423         else
1424           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1425       }
1426     }
1427   }
1428 
1429   /// Return the cost model decision for the given instruction \p I and vector
1430   /// width \p VF. Return CM_Unknown if this instruction did not pass
1431   /// through the cost modeling.
1432   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1433     assert(VF.isVector() && "Expected VF to be a vector VF");
1434     // Cost model is not run in the VPlan-native path - return conservative
1435     // result until this changes.
1436     if (EnableVPlanNativePath)
1437       return CM_GatherScatter;
1438 
1439     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1440     auto Itr = WideningDecisions.find(InstOnVF);
1441     if (Itr == WideningDecisions.end())
1442       return CM_Unknown;
1443     return Itr->second.first;
1444   }
1445 
1446   /// Return the vectorization cost for the given instruction \p I and vector
1447   /// width \p VF.
1448   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1449     assert(VF.isVector() && "Expected VF >=2");
1450     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1451     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1452            "The cost is not calculated");
1453     return WideningDecisions[InstOnVF].second;
1454   }
1455 
1456   /// Return True if instruction \p I is an optimizable truncate whose operand
1457   /// is an induction variable. Such a truncate will be removed by adding a new
1458   /// induction variable with the destination type.
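  /// For example, %t = trunc i64 %iv to i32, where %iv is an induction
  /// variable, can be removed by creating a new i32 induction variable and
  /// using it instead.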
1459   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1460     // If the instruction is not a truncate, return false.
1461     auto *Trunc = dyn_cast<TruncInst>(I);
1462     if (!Trunc)
1463       return false;
1464 
1465     // Get the source and destination types of the truncate.
1466     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1467     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1468 
1469     // If the truncate is free for the given types, return false. Replacing a
1470     // free truncate with an induction variable would add an induction variable
1471     // update instruction to each iteration of the loop. We exclude from this
1472     // check the primary induction variable since it will need an update
1473     // instruction regardless.
1474     Value *Op = Trunc->getOperand(0);
1475     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1476       return false;
1477 
1478     // If the truncated value is not an induction variable, return false.
1479     return Legal->isInductionPhi(Op);
1480   }
1481 
1482   /// Collects the instructions to scalarize for each predicated instruction in
1483   /// the loop.
1484   void collectInstsToScalarize(ElementCount VF);
1485 
1486   /// Collect Uniform and Scalar values for the given \p VF.
1487   /// The sets depend on CM decision for Load/Store instructions
1488   /// that may be vectorized as interleave, gather-scatter or scalarized.
1489   void collectUniformsAndScalars(ElementCount VF) {
1490     // Do the analysis once.
1491     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1492       return;
1493     setCostBasedWideningDecision(VF);
1494     collectLoopUniforms(VF);
1495     collectLoopScalars(VF);
1496   }
1497 
1498   /// Returns true if the target machine supports masked store operation
1499   /// for the given \p DataType and kind of access to \p Ptr.
1500   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1501     return Legal->isConsecutivePtr(DataType, Ptr) &&
1502            TTI.isLegalMaskedStore(DataType, Alignment);
1503   }
1504 
1505   /// Returns true if the target machine supports masked load operation
1506   /// for the given \p DataType and kind of access to \p Ptr.
1507   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1508     return Legal->isConsecutivePtr(DataType, Ptr) &&
1509            TTI.isLegalMaskedLoad(DataType, Alignment);
1510   }
1511 
1512   /// Returns true if the target machine can represent \p V as a masked gather
1513   /// or scatter operation.
1514   bool isLegalGatherOrScatter(Value *V,
1515                               ElementCount VF = ElementCount::getFixed(1)) {
1516     bool LI = isa<LoadInst>(V);
1517     bool SI = isa<StoreInst>(V);
1518     if (!LI && !SI)
1519       return false;
1520     auto *Ty = getLoadStoreType(V);
1521     Align Align = getLoadStoreAlignment(V);
1522     if (VF.isVector())
1523       Ty = VectorType::get(Ty, VF);
1524     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1525            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1526   }
1527 
1528   /// Returns true if the target machine supports all of the reduction
1529   /// variables found for the given VF.
1530   bool canVectorizeReductions(ElementCount VF) const {
1531     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1532       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1533       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1534     }));
1535   }
1536 
1537   /// Returns true if \p I is an instruction that will be scalarized with
1538   /// predication when vectorizing \p I with vectorization factor \p VF. Such
1539   /// instructions include conditional stores and instructions that may divide
1540   /// by zero.
1541   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1542 
1543   // Returns true if \p I is an instruction that will be predicated either
1544   // through scalar predication or masked load/store or masked gather/scatter.
1545   // \p VF is the vectorization factor that will be used to vectorize \p I.
1546   // Superset of instructions that return true for isScalarWithPredication.
1547   bool isPredicatedInst(Instruction *I, ElementCount VF,
1548                         bool IsKnownUniform = false) {
1549     // When we know the load is uniform and the original scalar loop was not
1550     // predicated we don't need to mark it as a predicated instruction. Any
    // vectorized blocks created when tail-folding are something artificial we
1552     // have introduced and we know there is always at least one active lane.
1553     // That's why we call Legal->blockNeedsPredication here because it doesn't
1554     // query tail-folding.
1555     if (IsKnownUniform && isa<LoadInst>(I) &&
1556         !Legal->blockNeedsPredication(I->getParent()))
1557       return false;
1558     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1559       return false;
1560     // Loads and stores that need some form of masked operation are predicated
1561     // instructions.
1562     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1563       return Legal->isMaskRequired(I);
1564     return isScalarWithPredication(I, VF);
1565   }
1566 
1567   /// Returns true if \p I is a memory instruction with consecutive memory
1568   /// access that can be widened.
1569   bool
1570   memoryInstructionCanBeWidened(Instruction *I,
1571                                 ElementCount VF = ElementCount::getFixed(1));
1572 
1573   /// Returns true if \p I is a memory instruction in an interleaved-group
1574   /// of memory accesses that can be vectorized with wide vector loads/stores
1575   /// and shuffles.
1576   bool
1577   interleavedAccessCanBeWidened(Instruction *I,
1578                                 ElementCount VF = ElementCount::getFixed(1));
1579 
1580   /// Check if \p Instr belongs to any interleaved access group.
1581   bool isAccessInterleaved(Instruction *Instr) {
1582     return InterleaveInfo.isInterleaved(Instr);
1583   }
1584 
1585   /// Get the interleaved access group that \p Instr belongs to.
1586   const InterleaveGroup<Instruction> *
1587   getInterleavedAccessGroup(Instruction *Instr) {
1588     return InterleaveInfo.getInterleaveGroup(Instr);
1589   }
1590 
1591   /// Returns true if we're required to use a scalar epilogue for at least
1592   /// the final iteration of the original loop.
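  /// This is the case, for example, when an exit may be taken from a block
  /// other than the latch, or when the interleave-group analysis requires a
  /// scalar epilogue (e.g. for groups with gaps).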
1593   bool requiresScalarEpilogue(ElementCount VF) const {
1594     if (!isScalarEpilogueAllowed())
1595       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
1597     // iteration in scalar form.
1598     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1599       return true;
1600     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1601   }
1602 
  /// Returns true if a scalar epilogue is allowed, i.e. not prohibited due to
  /// optsize or a loop hint annotation.
1605   bool isScalarEpilogueAllowed() const {
1606     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1607   }
1608 
1609   /// Returns true if all loop blocks should be masked to fold tail loop.
1610   bool foldTailByMasking() const { return FoldTailByMasking; }
1611 
  /// Returns true if the instructions in this block require predication
1613   /// for any reason, e.g. because tail folding now requires a predicate
1614   /// or because the block in the original loop was predicated.
1615   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1616     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1617   }
1618 
1619   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1620   /// nodes to the chain of instructions representing the reductions. Uses a
1621   /// MapVector to ensure deterministic iteration order.
1622   using ReductionChainMap =
1623       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1624 
1625   /// Return the chain of instructions representing an inloop reduction.
1626   const ReductionChainMap &getInLoopReductionChains() const {
1627     return InLoopReductionChains;
1628   }
1629 
1630   /// Returns true if the Phi is part of an inloop reduction.
1631   bool isInLoopReduction(PHINode *Phi) const {
1632     return InLoopReductionChains.count(Phi);
1633   }
1634 
1635   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1636   /// with factor VF.  Return the cost of the instruction, including
1637   /// scalarization overhead if it's needed.
1638   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1639 
1640   /// Estimate cost of a call instruction CI if it were vectorized with factor
1641   /// VF. Return the cost of the instruction, including scalarization overhead
1642   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1643   /// scalarized -
1644   /// i.e. either vector version isn't available, or is too expensive.
1645   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1646                                     bool &NeedToScalarize) const;
1647 
1648   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1649   /// that of B.
1650   bool isMoreProfitable(const VectorizationFactor &A,
1651                         const VectorizationFactor &B) const;
1652 
1653   /// Invalidates decisions already taken by the cost model.
1654   void invalidateCostModelingDecisions() {
1655     WideningDecisions.clear();
1656     Uniforms.clear();
1657     Scalars.clear();
1658   }
1659 
1660 private:
1661   unsigned NumPredStores = 0;
1662 
1663   /// Convenience function that returns the value of vscale_range iff
  /// vscale_range.min == vscale_range.max, or otherwise returns the value
  /// returned by the corresponding TTI method.
1666   Optional<unsigned> getVScaleForTuning() const;
1667 
1668   /// \return An upper bound for the vectorization factors for both
1669   /// fixed and scalable vectorization, where the minimum-known number of
1670   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1671   /// disabled or unsupported, then the scalable part will be equal to
1672   /// ElementCount::getScalable(0).
1673   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1674                                            ElementCount UserVF,
1675                                            bool FoldTailByMasking);
1676 
  /// \return the maximized element count based on the target's vector
1678   /// registers and the loop trip-count, but limited to a maximum safe VF.
1679   /// This is a helper function of computeFeasibleMaxVF.
1680   /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1681   /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
1683   /// D98509). The issue is currently under investigation and this workaround
1684   /// will be removed as soon as possible.
1685   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1686                                        unsigned SmallestType,
1687                                        unsigned WidestType,
1688                                        const ElementCount &MaxSafeVF,
1689                                        bool FoldTailByMasking);
1690 
1691   /// \return the maximum legal scalable VF, based on the safe max number
1692   /// of elements.
1693   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1694 
1695   /// The vectorization cost is a combination of the cost itself and a boolean
1696   /// indicating whether any of the contributing operations will actually
1697   /// operate on vector values after type legalization in the backend. If this
1698   /// latter value is false, then all operations will be scalarized (i.e. no
1699   /// vectorization has actually taken place).
1700   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1701 
1702   /// Returns the expected execution cost. The unit of the cost does
1703   /// not matter because we use the 'cost' units to compare different
1704   /// vector widths. The cost that is returned is *not* normalized by
1705   /// the factor width. If \p Invalid is not nullptr, this function
1706   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1707   /// each instruction that has an Invalid cost for the given VF.
1708   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1709   VectorizationCostTy
1710   expectedCost(ElementCount VF,
1711                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1712 
1713   /// Returns the execution time cost of an instruction for a given vector
1714   /// width. Vector width of one means scalar.
1715   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1716 
1717   /// The cost-computation logic from getInstructionCost which provides
1718   /// the vector type as an output parameter.
1719   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1720                                      Type *&VectorTy);
1721 
1722   /// Return the cost of instructions in an inloop reduction pattern, if I is
1723   /// part of that pattern.
1724   Optional<InstructionCost>
1725   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1726                           TTI::TargetCostKind CostKind);
1727 
1728   /// Calculate vectorization cost of memory instruction \p I.
1729   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1730 
1731   /// The cost computation for scalarized memory instruction.
1732   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1733 
1734   /// The cost computation for interleaving group of memory instructions.
1735   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1736 
1737   /// The cost computation for Gather/Scatter instruction.
1738   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1739 
1740   /// The cost computation for widening instruction \p I with consecutive
1741   /// memory access.
1742   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1743 
1744   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1745   /// Load: scalar load + broadcast.
1746   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1747   /// element)
1748   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1749 
1750   /// Estimate the overhead of scalarizing an instruction. This is a
1751   /// convenience wrapper for the type-based getScalarizationOverhead API.
1752   InstructionCost getScalarizationOverhead(Instruction *I,
1753                                            ElementCount VF) const;
1754 
  /// Returns whether the instruction is a load or store and will be emitted
1756   /// as a vector operation.
1757   bool isConsecutiveLoadOrStore(Instruction *I);
1758 
1759   /// Returns true if an artificially high cost for emulated masked memrefs
1760   /// should be used.
1761   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1762 
1763   /// Map of scalar integer values to the smallest bitwidth they can be legally
1764   /// represented as. The vector equivalents of these values should be truncated
1765   /// to this type.
1766   MapVector<Instruction *, uint64_t> MinBWs;
1767 
1768   /// A type representing the costs for instructions if they were to be
1769   /// scalarized rather than vectorized. The entries are Instruction-Cost
1770   /// pairs.
1771   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1772 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1775   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1776 
1777   /// Records whether it is allowed to have the original scalar loop execute at
1778   /// least once. This may be needed as a fallback loop in case runtime
1779   /// aliasing/dependence checks fail, or to handle the tail/remainder
1780   /// iterations when the trip count is unknown or doesn't divide by the VF,
1781   /// or as a peel-loop to handle gaps in interleave-groups.
1782   /// Under optsize and when the trip count is very small we don't allow any
1783   /// iterations to execute in the scalar loop.
1784   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1785 
  /// All blocks of the loop are to be masked in order to fold the tail of the
  /// scalar iterations.
1787   bool FoldTailByMasking = false;
1788 
1789   /// A map holding scalar costs for different vectorization factors. The
1790   /// presence of a cost for an instruction in the mapping indicates that the
1791   /// instruction will be scalarized when vectorizing with the associated
1792   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1793   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1794 
1795   /// Holds the instructions known to be uniform after vectorization.
1796   /// The data is collected per VF.
1797   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1798 
1799   /// Holds the instructions known to be scalar after vectorization.
1800   /// The data is collected per VF.
1801   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1802 
1803   /// Holds the instructions (address computations) that are forced to be
1804   /// scalarized.
1805   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1806 
1807   /// PHINodes of the reductions that should be expanded in-loop along with
1808   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1810   ReductionChainMap InLoopReductionChains;
1811 
1812   /// A Map of inloop reduction operations and their immediate chain operand.
1813   /// FIXME: This can be removed once reductions can be costed correctly in
1814   /// vplan. This was added to allow quick lookup to the inloop operations,
1815   /// without having to loop through InLoopReductionChains.
1816   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1817 
1818   /// Returns the expected difference in cost from scalarizing the expression
1819   /// feeding a predicated instruction \p PredInst. The instructions to
1820   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1821   /// non-negative return value implies the expression will be scalarized.
1822   /// Currently, only single-use chains are considered for scalarization.
1823   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1824                               ElementCount VF);
1825 
1826   /// Collect the instructions that are uniform after vectorization. An
1827   /// instruction is uniform if we represent it with a single scalar value in
1828   /// the vectorized loop corresponding to each vector iteration. Examples of
1829   /// uniform instructions include pointer operands of consecutive or
1830   /// interleaved memory accesses. Note that although uniformity implies an
1831   /// instruction will be scalar, the reverse is not true. In general, a
1832   /// scalarized instruction will be represented by VF scalar values in the
1833   /// vectorized loop, each corresponding to an iteration of the original
1834   /// scalar loop.
1835   void collectLoopUniforms(ElementCount VF);
1836 
1837   /// Collect the instructions that are scalar after vectorization. An
1838   /// instruction is scalar if it is known to be uniform or will be scalarized
1839   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1840   /// to the list if they are used by a load/store instruction that is marked as
1841   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1842   /// VF values in the vectorized loop, each corresponding to an iteration of
1843   /// the original scalar loop.
1844   void collectLoopScalars(ElementCount VF);
1845 
  /// Keeps cost model vectorization decisions and costs for instructions.
1847   /// Right now it is used for memory instructions only.
1848   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1849                                 std::pair<InstWidening, InstructionCost>>;
1850 
1851   DecisionList WideningDecisions;
1852 
1853   /// Returns true if \p V is expected to be vectorized and it needs to be
1854   /// extracted.
1855   bool needsExtract(Value *V, ElementCount VF) const {
1856     Instruction *I = dyn_cast<Instruction>(V);
1857     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1858         TheLoop->isLoopInvariant(I))
1859       return false;
1860 
1861     // Assume we can vectorize V (and hence we need extraction) if the
1862     // scalars are not computed yet. This can happen, because it is called
1863     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1864     // the scalars are collected. That should be a safe assumption in most
1865     // cases, because we check if the operands have vectorizable types
1866     // beforehand in LoopVectorizationLegality.
1867     return Scalars.find(VF) == Scalars.end() ||
1868            !isScalarAfterVectorization(I, VF);
1869   };
1870 
1871   /// Returns a range containing only operands needing to be extracted.
1872   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1873                                                    ElementCount VF) const {
1874     return SmallVector<Value *, 4>(make_filter_range(
1875         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1876   }
1877 
1878   /// Determines if we have the infrastructure to vectorize loop \p L and its
1879   /// epilogue, assuming the main loop is vectorized by \p VF.
1880   bool isCandidateForEpilogueVectorization(const Loop &L,
1881                                            const ElementCount VF) const;
1882 
1883   /// Returns true if epilogue vectorization is considered profitable, and
1884   /// false otherwise.
1885   /// \p VF is the vectorization factor chosen for the original loop.
1886   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1887 
1888 public:
1889   /// The loop that we evaluate.
1890   Loop *TheLoop;
1891 
1892   /// Predicated scalar evolution analysis.
1893   PredicatedScalarEvolution &PSE;
1894 
1895   /// Loop Info analysis.
1896   LoopInfo *LI;
1897 
1898   /// Vectorization legality.
1899   LoopVectorizationLegality *Legal;
1900 
1901   /// Vector target information.
1902   const TargetTransformInfo &TTI;
1903 
1904   /// Target Library Info.
1905   const TargetLibraryInfo *TLI;
1906 
1907   /// Demanded bits analysis.
1908   DemandedBits *DB;
1909 
1910   /// Assumption cache.
1911   AssumptionCache *AC;
1912 
1913   /// Interface to emit optimization remarks.
1914   OptimizationRemarkEmitter *ORE;
1915 
1916   const Function *TheFunction;
1917 
1918   /// Loop Vectorize Hint.
1919   const LoopVectorizeHints *Hints;
1920 
1921   /// The interleave access information contains groups of interleaved accesses
1922   /// with the same stride and close to each other.
1923   InterleavedAccessInfo &InterleaveInfo;
1924 
1925   /// Values to ignore in the cost model.
1926   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1927 
1928   /// Values to ignore in the cost model when VF > 1.
1929   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1930 
1931   /// All element types found in the loop.
1932   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1933 
1934   /// Profitable vector factors.
1935   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1936 };
1937 } // end namespace llvm
1938 
1939 /// Helper struct to manage generating runtime checks for vectorization.
1940 ///
/// The runtime checks are created up-front in temporary blocks to allow
/// better cost estimation, and are un-linked from the existing IR. After
/// deciding to
1943 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1944 /// temporary blocks are completely removed.
1945 class GeneratedRTChecks {
1946   /// Basic block which contains the generated SCEV checks, if any.
1947   BasicBlock *SCEVCheckBlock = nullptr;
1948 
1949   /// The value representing the result of the generated SCEV checks. If it is
1950   /// nullptr, either no SCEV checks have been generated or they have been used.
1951   Value *SCEVCheckCond = nullptr;
1952 
1953   /// Basic block which contains the generated memory runtime checks, if any.
1954   BasicBlock *MemCheckBlock = nullptr;
1955 
1956   /// The value representing the result of the generated memory runtime checks.
1957   /// If it is nullptr, either no memory runtime checks have been generated or
1958   /// they have been used.
1959   Value *MemRuntimeCheckCond = nullptr;
1960 
1961   DominatorTree *DT;
1962   LoopInfo *LI;
1963 
1964   SCEVExpander SCEVExp;
1965   SCEVExpander MemCheckExp;
1966 
1967 public:
1968   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1969                     const DataLayout &DL)
1970       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1971         MemCheckExp(SE, DL, "scev.check") {}
1972 
1973   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1974   /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
1978   void Create(Loop *L, const LoopAccessInfo &LAI,
1979               const SCEVPredicate &Pred) {
1980 
1981     BasicBlock *LoopHeader = L->getHeader();
1982     BasicBlock *Preheader = L->getLoopPreheader();
1983 
1984     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1985     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1986     // may be used by SCEVExpander. The blocks will be un-linked from their
1987     // predecessors and removed from LI & DT at the end of the function.
1988     if (!Pred.isAlwaysTrue()) {
1989       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1990                                   nullptr, "vector.scevcheck");
1991 
1992       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1993           &Pred, SCEVCheckBlock->getTerminator());
1994     }
1995 
1996     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1997     if (RtPtrChecking.Need) {
1998       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1999       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2000                                  "vector.memcheck");
2001 
2002       MemRuntimeCheckCond =
2003           addRuntimeChecks(MemCheckBlock->getTerminator(), L,
2004                            RtPtrChecking.getChecks(), MemCheckExp);
2005       assert(MemRuntimeCheckCond &&
2006              "no RT checks generated although RtPtrChecking "
2007              "claimed checks are required");
2008     }
2009 
2010     if (!MemCheckBlock && !SCEVCheckBlock)
2011       return;
2012 
2013     // Unhook the temporary block with the checks, update various places
2014     // accordingly.
2015     if (SCEVCheckBlock)
2016       SCEVCheckBlock->replaceAllUsesWith(Preheader);
2017     if (MemCheckBlock)
2018       MemCheckBlock->replaceAllUsesWith(Preheader);
2019 
2020     if (SCEVCheckBlock) {
2021       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2022       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2023       Preheader->getTerminator()->eraseFromParent();
2024     }
2025     if (MemCheckBlock) {
2026       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2027       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2028       Preheader->getTerminator()->eraseFromParent();
2029     }
2030 
2031     DT->changeImmediateDominator(LoopHeader, Preheader);
2032     if (MemCheckBlock) {
2033       DT->eraseNode(MemCheckBlock);
2034       LI->removeBlock(MemCheckBlock);
2035     }
2036     if (SCEVCheckBlock) {
2037       DT->eraseNode(SCEVCheckBlock);
2038       LI->removeBlock(SCEVCheckBlock);
2039     }
2040   }
2041 
2042   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2043   /// unused.
2044   ~GeneratedRTChecks() {
2045     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2046     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2047     if (!SCEVCheckCond)
2048       SCEVCleaner.markResultUsed();
2049 
2050     if (!MemRuntimeCheckCond)
2051       MemCheckCleaner.markResultUsed();
2052 
2053     if (MemRuntimeCheckCond) {
2054       auto &SE = *MemCheckExp.getSE();
2055       // Memory runtime check generation creates compares that use expanded
2056       // values. Remove them before running the SCEVExpanderCleaners.
2057       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2058         if (MemCheckExp.isInsertedInstruction(&I))
2059           continue;
2060         SE.forgetValue(&I);
2061         I.eraseFromParent();
2062       }
2063     }
2064     MemCheckCleaner.cleanup();
2065     SCEVCleaner.cleanup();
2066 
2067     if (SCEVCheckCond)
2068       SCEVCheckBlock->eraseFromParent();
2069     if (MemRuntimeCheckCond)
2070       MemCheckBlock->eraseFromParent();
2071   }
2072 
2073   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2074   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2075   /// depending on the generated condition.
2076   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2077                              BasicBlock *LoopVectorPreHeader,
2078                              BasicBlock *LoopExitBlock) {
2079     if (!SCEVCheckCond)
2080       return nullptr;
2081     if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2082       if (C->isZero())
2083         return nullptr;
2084 
2085     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2086 
2087     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2088     // Create new preheader for vector loop.
2089     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2090       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2091 
2092     SCEVCheckBlock->getTerminator()->eraseFromParent();
2093     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2094     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2095                                                 SCEVCheckBlock);
2096 
2097     DT->addNewBlock(SCEVCheckBlock, Pred);
2098     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2099 
2100     ReplaceInstWithInst(
2101         SCEVCheckBlock->getTerminator(),
2102         BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2103     // Mark the check as used, to prevent it from being removed during cleanup.
2104     SCEVCheckCond = nullptr;
2105     return SCEVCheckBlock;
2106   }
2107 
2108   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2109   /// the branches to branch to the vector preheader or \p Bypass, depending on
2110   /// the generated condition.
2111   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2112                                    BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks at runtime whether arrays
    // overlap.
2114     if (!MemRuntimeCheckCond)
2115       return nullptr;
2116 
2117     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2118     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2119                                                 MemCheckBlock);
2120 
2121     DT->addNewBlock(MemCheckBlock, Pred);
2122     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2123     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2124 
2125     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2126       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2127 
2128     ReplaceInstWithInst(
2129         MemCheckBlock->getTerminator(),
2130         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2131     MemCheckBlock->getTerminator()->setDebugLoc(
2132         Pred->getTerminator()->getDebugLoc());
2133 
2134     // Mark the check as used, to prevent it from being removed during cleanup.
2135     MemRuntimeCheckCond = nullptr;
2136     return MemCheckBlock;
2137   }
2138 };
2139 
2140 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2141 // vectorization. The loop needs to be annotated with #pragma omp simd
2142 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2143 // vector length information is not provided, vectorization is not considered
2144 // explicit. Interleave hints are not allowed either. These limitations will be
2145 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
2147 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2148 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2149 // provides *explicit vectorization hints* (LV can bypass legal checks and
2150 // assume that vectorization is legal). However, both hints are implemented
2151 // using the same metadata (llvm.loop.vectorize, processed by
2152 // LoopVectorizeHints). This will be fixed in the future when the native IR
2153 // representation for pragma 'omp simd' is introduced.
2154 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2155                                    OptimizationRemarkEmitter *ORE) {
2156   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2157   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2158 
2159   // Only outer loops with an explicit vectorization hint are supported.
2160   // Unannotated outer loops are ignored.
2161   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2162     return false;
2163 
2164   Function *Fn = OuterLp->getHeader()->getParent();
2165   if (!Hints.allowVectorization(Fn, OuterLp,
2166                                 true /*VectorizeOnlyWhenForced*/)) {
2167     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2168     return false;
2169   }
2170 
2171   if (Hints.getInterleave() > 1) {
2172     // TODO: Interleave support is future work.
2173     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2174                          "outer loops.\n");
2175     Hints.emitRemarkWithHints();
2176     return false;
2177   }
2178 
2179   return true;
2180 }
2181 
2182 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2183                                   OptimizationRemarkEmitter *ORE,
2184                                   SmallVectorImpl<Loop *> &V) {
2185   // Collect inner loops and outer loops without irreducible control flow. For
2186   // now, only collect outer loops that have explicit vectorization hints. If we
2187   // are stress testing the VPlan H-CFG construction, we collect the outermost
2188   // loop of every loop nest.
2189   if (L.isInnermost() || VPlanBuildStressTest ||
2190       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2191     LoopBlocksRPO RPOT(&L);
2192     RPOT.perform(LI);
2193     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2194       V.push_back(&L);
2195       // TODO: Collect inner loops inside marked outer loops in case
2196       // vectorization fails for the outer loop. Do not invoke
2197       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2198       // already known to be reducible. We can use an inherited attribute for
2199       // that.
2200       return;
2201     }
2202   }
2203   for (Loop *InnerL : L)
2204     collectSupportedLoops(*InnerL, LI, ORE, V);
2205 }
2206 
2207 namespace {
2208 
2209 /// The LoopVectorize Pass.
2210 struct LoopVectorize : public FunctionPass {
2211   /// Pass identification, replacement for typeid
2212   static char ID;
2213 
2214   LoopVectorizePass Impl;
2215 
2216   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2217                          bool VectorizeOnlyWhenForced = false)
2218       : FunctionPass(ID),
2219         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2220     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2221   }
2222 
2223   bool runOnFunction(Function &F) override {
2224     if (skipFunction(F))
2225       return false;
2226 
2227     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2228     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2229     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2230     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2231     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2232     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2233     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2234     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2235     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2236     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2237     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2238     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2239     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2240 
2241     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2242         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2243 
2244     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2245                         GetLAA, *ORE, PSI).MadeAnyChange;
2246   }
2247 
2248   void getAnalysisUsage(AnalysisUsage &AU) const override {
2249     AU.addRequired<AssumptionCacheTracker>();
2250     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2251     AU.addRequired<DominatorTreeWrapperPass>();
2252     AU.addRequired<LoopInfoWrapperPass>();
2253     AU.addRequired<ScalarEvolutionWrapperPass>();
2254     AU.addRequired<TargetTransformInfoWrapperPass>();
2255     AU.addRequired<AAResultsWrapperPass>();
2256     AU.addRequired<LoopAccessLegacyAnalysis>();
2257     AU.addRequired<DemandedBitsWrapperPass>();
2258     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2259     AU.addRequired<InjectTLIMappingsLegacy>();
2260 
2261     // We currently do not preserve loopinfo/dominator analyses with outer loop
2262     // vectorization. Until this is addressed, mark these analyses as preserved
2263     // only for non-VPlan-native path.
2264     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2265     if (!EnableVPlanNativePath) {
2266       AU.addPreserved<LoopInfoWrapperPass>();
2267       AU.addPreserved<DominatorTreeWrapperPass>();
2268     }
2269 
2270     AU.addPreserved<BasicAAWrapperPass>();
2271     AU.addPreserved<GlobalsAAWrapperPass>();
2272     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2273   }
2274 };
2275 
2276 } // end anonymous namespace
2277 
2278 //===----------------------------------------------------------------------===//
2279 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2280 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2281 //===----------------------------------------------------------------------===//
2282 
2283 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2284   // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
2287   Instruction *Instr = dyn_cast<Instruction>(V);
2288   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2289                      (!Instr ||
2290                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2291   // Place the code for broadcasting invariant variables in the new preheader.
2292   IRBuilder<>::InsertPointGuard Guard(Builder);
2293   if (SafeToHoist)
2294     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2295 
2296   // Broadcast the scalar into all locations in the vector.
2297   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2298 
2299   return Shuf;
2300 }
2301 
2302 /// This function adds
2303 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is relevant for FP induction variables.
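/// For example, with VF = 4, StartIdx = 0 and Step = 1, a splat vector of %x
/// becomes <%x + 0, %x + 1, %x + 2, %x + 3>.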
2306 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2307                             Instruction::BinaryOps BinOp, ElementCount VF,
2308                             IRBuilderBase &Builder) {
2309   assert(VF.isVector() && "only vector VFs are supported");
2310 
2311   // Create and check the types.
2312   auto *ValVTy = cast<VectorType>(Val->getType());
2313   ElementCount VLen = ValVTy->getElementCount();
2314 
2315   Type *STy = Val->getType()->getScalarType();
2316   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2317          "Induction Step must be an integer or FP");
2318   assert(Step->getType() == STy && "Step has wrong type");
2319 
2320   SmallVector<Constant *, 8> Indices;
2321 
2322   // Create a vector of consecutive numbers from zero to VF.
2323   VectorType *InitVecValVTy = ValVTy;
2324   if (STy->isFloatingPointTy()) {
2325     Type *InitVecValSTy =
2326         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2327     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2328   }
2329   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2330 
2331   // Splat the StartIdx
2332   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2333 
2334   if (STy->isIntegerTy()) {
2335     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2336     Step = Builder.CreateVectorSplat(VLen, Step);
2337     assert(Step->getType() == Val->getType() && "Invalid step vec");
2338     // FIXME: The newly created binary instructions should contain nsw/nuw
2339     // flags, which can be found from the original scalar operations.
2340     Step = Builder.CreateMul(InitVec, Step);
2341     return Builder.CreateAdd(Val, Step, "induction");
2342   }
2343 
2344   // Floating point induction.
2345   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2346          "Binary Opcode should be specified for FP induction");
2347   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2348   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2349 
2350   Step = Builder.CreateVectorSplat(VLen, Step);
2351   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2352   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2353 }
2354 
2355 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2356 /// variable on which to base the steps, \p Step is the size of the step.
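/// For example, with UF = 2, a fixed VF = 4 and an integer step of 1, part 0
/// yields the lanes ScalarIV + 0..3 and part 1 yields ScalarIV + 4..7.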
2357 static void buildScalarSteps(Value *ScalarIV, Value *Step,
2358                              const InductionDescriptor &ID, VPValue *Def,
2359                              VPTransformState &State) {
2360   IRBuilderBase &Builder = State.Builder;
2361   // We shouldn't have to build scalar steps if we aren't vectorizing.
2362   assert(State.VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2364   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2365   assert(ScalarIVTy == Step->getType() &&
2366          "Val and Step should have the same type");
2367 
2368   // We build scalar steps for both integer and floating-point induction
2369   // variables. Here, we determine the kind of arithmetic we will perform.
2370   Instruction::BinaryOps AddOp;
2371   Instruction::BinaryOps MulOp;
2372   if (ScalarIVTy->isIntegerTy()) {
2373     AddOp = Instruction::Add;
2374     MulOp = Instruction::Mul;
2375   } else {
2376     AddOp = ID.getInductionOpcode();
2377     MulOp = Instruction::FMul;
2378   }
2379 
2380   // Determine the number of scalars we need to generate for each unroll
2381   // iteration.
2382   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2383   unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2384   // Compute the scalar steps and save the results in State.
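  // Conceptually, the scalar value produced for unroll part Part and lane
  // Lane is ScalarIV + (Part * VF + Lane) * Step, using FAdd/FMul for
  // floating-point inductions.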
2385   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2386                                      ScalarIVTy->getScalarSizeInBits());
2387   Type *VecIVTy = nullptr;
2388   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2389   if (!FirstLaneOnly && State.VF.isScalable()) {
2390     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2391     UnitStepVec =
2392         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2393     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2394     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2395   }
2396 
2397   for (unsigned Part = 0; Part < State.UF; ++Part) {
2398     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2399 
2400     if (!FirstLaneOnly && State.VF.isScalable()) {
2401       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2402       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2403       if (ScalarIVTy->isFloatingPointTy())
2404         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2405       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2406       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2407       State.set(Def, Add, Part);
      // It's useful to also record the per-lane values for the known minimum
      // number of elements, so we do that below. This improves the code
      // quality when, for example, only the first element is extracted.
2411     }
2412 
2413     if (ScalarIVTy->isFloatingPointTy())
2414       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2415 
2416     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2417       Value *StartIdx = Builder.CreateBinOp(
2418           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2419       // The step returned by `createStepForVF` is a runtime-evaluated value
2420       // when VF is scalable. Otherwise, it should be folded into a Constant.
2421       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2422              "Expected StartIdx to be folded to a constant when VF is not "
2423              "scalable");
2424       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2425       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2426       State.set(Def, Add, VPIteration(Part, Lane));
2427     }
2428   }
2429 }
2430 
// Generate code for the induction step. Note that induction steps are
// required to be loop-invariant.
2433 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2434                               Instruction *InsertBefore,
2435                               Loop *OrigLoop = nullptr) {
2436   const DataLayout &DL = SE.getDataLayout();
2437   assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2438          "Induction step should be loop invariant");
2439   if (auto *E = dyn_cast<SCEVUnknown>(Step))
2440     return E->getValue();
2441 
2442   SCEVExpander Exp(SE, DL, "induction");
2443   return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2444 }
2445 
2446 /// Compute the transformed value of Index at offset StartValue using step
2447 /// StepValue.
2448 /// For integer induction, returns StartValue + Index * StepValue.
2449 /// For pointer induction, returns StartValue[Index * StepValue].
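/// For FP induction, returns StartValue +/- Index * StepValue, using the
/// FAdd/FSub opcode of the original induction.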
2450 /// FIXME: The newly created binary instructions should contain nsw/nuw
2451 /// flags, which can be found from the original scalar operations.
2452 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2453                                    Value *StartValue, Value *Step,
2454                                    const InductionDescriptor &ID) {
2455   assert(Index->getType()->getScalarType() == Step->getType() &&
2456          "Index scalar type does not match StepValue type");
2457 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
2464   auto CreateAdd = [&B](Value *X, Value *Y) {
2465     assert(X->getType() == Y->getType() && "Types don't match!");
2466     if (auto *CX = dyn_cast<ConstantInt>(X))
2467       if (CX->isZero())
2468         return Y;
2469     if (auto *CY = dyn_cast<ConstantInt>(Y))
2470       if (CY->isZero())
2471         return X;
2472     return B.CreateAdd(X, Y);
2473   };
2474 
2475   // We allow X to be a vector type, in which case Y will potentially be
2476   // splatted into a vector with the same element count.
2477   auto CreateMul = [&B](Value *X, Value *Y) {
2478     assert(X->getType()->getScalarType() == Y->getType() &&
2479            "Types don't match!");
2480     if (auto *CX = dyn_cast<ConstantInt>(X))
2481       if (CX->isOne())
2482         return Y;
2483     if (auto *CY = dyn_cast<ConstantInt>(Y))
2484       if (CY->isOne())
2485         return X;
2486     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2487     if (XVTy && !isa<VectorType>(Y->getType()))
2488       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2489     return B.CreateMul(X, Y);
2490   };
2491 
2492   switch (ID.getKind()) {
2493   case InductionDescriptor::IK_IntInduction: {
2494     assert(!isa<VectorType>(Index->getType()) &&
2495            "Vector indices not supported for integer inductions yet");
2496     assert(Index->getType() == StartValue->getType() &&
2497            "Index type does not match StartValue type");
2498     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2499       return B.CreateSub(StartValue, Index);
2500     auto *Offset = CreateMul(Index, Step);
2501     return CreateAdd(StartValue, Offset);
2502   }
2503   case InductionDescriptor::IK_PtrInduction: {
2504     assert(isa<Constant>(Step) &&
2505            "Expected constant step for pointer induction");
2506     return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2507   }
2508   case InductionDescriptor::IK_FpInduction: {
2509     assert(!isa<VectorType>(Index->getType()) &&
2510            "Vector indices not supported for FP inductions yet");
2511     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2512     auto InductionBinOp = ID.getInductionBinOp();
2513     assert(InductionBinOp &&
2514            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2515             InductionBinOp->getOpcode() == Instruction::FSub) &&
2516            "Original bin op should be defined for FP induction");
2517 
2518     Value *MulExp = B.CreateFMul(Step, Index);
2519     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2520                          "induction");
2521   }
2522   case InductionDescriptor::IK_NoInduction:
2523     return nullptr;
2524   }
2525   llvm_unreachable("invalid enum");
2526 }
2527 
2528 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2529                                                     const VPIteration &Instance,
2530                                                     VPTransformState &State) {
2531   Value *ScalarInst = State.get(Def, Instance);
2532   Value *VectorValue = State.get(Def, Instance.Part);
2533   VectorValue = Builder.CreateInsertElement(
2534       VectorValue, ScalarInst,
2535       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2536   State.set(Def, VectorValue, Instance.Part);
2537 }
2538 
2539 // Return whether we allow using masked interleave-groups (for dealing with
2540 // strided loads/stores that reside in predicated blocks, or for dealing
2541 // with gaps).
2542 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2543   // If an override option has been passed in for interleaved accesses, use it.
2544   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2545     return EnableMaskedInterleavedMemAccesses;
2546 
2547   return TTI.enableMaskedInterleavedAccessVectorization();
2548 }
2549 
2550 // Try to vectorize the interleave group that \p Instr belongs to.
2551 //
// E.g. translate the following interleaved load group (factor = 3):
2553 //   for (i = 0; i < N; i+=3) {
2554 //     R = Pic[i];             // Member of index 0
2555 //     G = Pic[i+1];           // Member of index 1
2556 //     B = Pic[i+2];           // Member of index 2
2557 //     ... // do something to R, G, B
2558 //   }
2559 // To:
2560 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2561 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2562 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2563 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2564 //
// Or translate the following interleaved store group (factor = 3):
2566 //   for (i = 0; i < N; i+=3) {
2567 //     ... do something to R, G, B
2568 //     Pic[i]   = R;           // Member of index 0
2569 //     Pic[i+1] = G;           // Member of index 1
2570 //     Pic[i+2] = B;           // Member of index 2
2571 //   }
2572 // To:
2573 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2574 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2575 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2576 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2577 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2578 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2579     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2580     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2581     VPValue *BlockInMask) {
2582   Instruction *Instr = Group->getInsertPos();
2583   const DataLayout &DL = Instr->getModule()->getDataLayout();
2584 
2585   // Prepare for the vector type of the interleaved load/store.
2586   Type *ScalarTy = getLoadStoreType(Instr);
2587   unsigned InterleaveFactor = Group->getFactor();
2588   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2589   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2590 
2591   // Prepare for the new pointers.
2592   SmallVector<Value *, 2> AddrParts;
2593   unsigned Index = Group->getIndex(Instr);
2594 
2595   // TODO: extend the masked interleaved-group support to reversed access.
2596   assert((!BlockInMask || !Group->isReverse()) &&
2597          "Reversed masked interleave-group not supported.");
2598 
2599   // If the group is reverse, adjust the index to refer to the last vector lane
2600   // instead of the first. We adjust the index from the first vector lane,
2601   // rather than directly getting the pointer for lane VF - 1, because the
2602   // pointer operand of the interleaved access is supposed to be uniform. For
2603   // uniform instructions, we're only required to generate a value for the
2604   // first vector lane in each unroll iteration.
2605   if (Group->isReverse())
2606     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2607 
2608   for (unsigned Part = 0; Part < UF; Part++) {
2609     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2610     setDebugLocFromInst(AddrPart);
2611 
    // Note that the current instruction could be at any member index of the
    // group. We need to adjust the address to that of the member at index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to point to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to point to A[i].
2623 
2624     bool InBounds = false;
2625     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2626       InBounds = gep->isInBounds();
2627     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2628     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2629 
2630     // Cast to the vector pointer type.
2631     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2632     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2633     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2634   }
2635 
2636   setDebugLocFromInst(Instr);
2637   Value *PoisonVec = PoisonValue::get(VecTy);
2638 
2639   Value *MaskForGaps = nullptr;
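  // MaskForGaps masks off the lanes corresponding to members that are missing
  // from the group. E.g. for a factor-2 group with only member 0 present and
  // VF = 4, the mask is <1,0,1,0,1,0,1,0> (illustrative).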
2640   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2641     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2642     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2643   }
2644 
2645   // Vectorize the interleaved load group.
2646   if (isa<LoadInst>(Instr)) {
2647     // For each unroll part, create a wide load for the group.
2648     SmallVector<Value *, 2> NewLoads;
2649     for (unsigned Part = 0; Part < UF; Part++) {
2650       Instruction *NewLoad;
2651       if (BlockInMask || MaskForGaps) {
2652         assert(useMaskedInterleavedAccesses(*TTI) &&
2653                "masked interleaved groups are not allowed.");
2654         Value *GroupMask = MaskForGaps;
2655         if (BlockInMask) {
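          // Replicate the per-iteration block mask across the interleave
          // factor, e.g. for factor 3 and VF 4 the mask <m0,m1,m2,m3> becomes
          // <m0,m0,m0,m1,m1,m1,m2,m2,m2,m3,m3,m3> (illustrative).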
2656           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2657           Value *ShuffledMask = Builder.CreateShuffleVector(
2658               BlockInMaskPart,
2659               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2660               "interleaved.mask");
2661           GroupMask = MaskForGaps
2662                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2663                                                 MaskForGaps)
2664                           : ShuffledMask;
2665         }
2666         NewLoad =
2667             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2668                                      GroupMask, PoisonVec, "wide.masked.vec");
2669       }
2670       else
2671         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2672                                             Group->getAlign(), "wide.vec");
2673       Group->addMetadata(NewLoad);
2674       NewLoads.push_back(NewLoad);
2675     }
2676 
2677     // For each member in the group, shuffle out the appropriate data from the
2678     // wide loads.
2679     unsigned J = 0;
2680     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2681       Instruction *Member = Group->getMember(I);
2682 
2683       // Skip the gaps in the group.
2684       if (!Member)
2685         continue;
2686 
2687       auto StrideMask =
2688           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2689       for (unsigned Part = 0; Part < UF; Part++) {
2690         Value *StridedVec = Builder.CreateShuffleVector(
2691             NewLoads[Part], StrideMask, "strided.vec");
2692 
        // If this member has a different type, cast the result to that type.
2694         if (Member->getType() != ScalarTy) {
2695           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2696           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2697           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2698         }
2699 
2700         if (Group->isReverse())
2701           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2702 
2703         State.set(VPDefs[J], StridedVec, Part);
2704       }
2705       ++J;
2706     }
2707     return;
2708   }
2709 
  // The sub-vector type for the current instruction.
2711   auto *SubVT = VectorType::get(ScalarTy, VF);
2712 
2713   // Vectorize the interleaved store group.
2714   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2715   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2716          "masked interleaved groups are not allowed.");
2717   assert((!MaskForGaps || !VF.isScalable()) &&
2718          "masking gaps for scalable vectors is not yet supported.");
2719   for (unsigned Part = 0; Part < UF; Part++) {
2720     // Collect the stored vector from each member.
2721     SmallVector<Value *, 4> StoredVecs;
2722     for (unsigned i = 0; i < InterleaveFactor; i++) {
2723       assert((Group->getMember(i) || MaskForGaps) &&
2724              "Fail to get a member from an interleaved store group");
2725       Instruction *Member = Group->getMember(i);
2726 
2727       // Skip the gaps in the group.
2728       if (!Member) {
        Value *Poison = PoisonValue::get(SubVT);
        StoredVecs.push_back(Poison);
2731         continue;
2732       }
2733 
2734       Value *StoredVec = State.get(StoredValues[i], Part);
2735 
2736       if (Group->isReverse())
2737         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2738 
      // If this member has a different type, cast it to the unified type.
      if (StoredVec->getType() != SubVT)
2742         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2743 
2744       StoredVecs.push_back(StoredVec);
2745     }
2746 
2747     // Concatenate all vectors into a wide vector.
2748     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2749 
2750     // Interleave the elements in the wide vector.
2751     Value *IVec = Builder.CreateShuffleVector(
2752         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2753         "interleaved.vec");
2754 
2755     Instruction *NewStoreInstr;
2756     if (BlockInMask || MaskForGaps) {
2757       Value *GroupMask = MaskForGaps;
2758       if (BlockInMask) {
2759         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2760         Value *ShuffledMask = Builder.CreateShuffleVector(
2761             BlockInMaskPart,
2762             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2763             "interleaved.mask");
2764         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2765                                                       ShuffledMask, MaskForGaps)
2766                                 : ShuffledMask;
2767       }
2768       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2769                                                 Group->getAlign(), GroupMask);
2770     } else
2771       NewStoreInstr =
2772           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2773 
2774     Group->addMetadata(NewStoreInstr);
2775   }
2776 }
2777 
2778 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2779                                                VPReplicateRecipe *RepRecipe,
2780                                                const VPIteration &Instance,
2781                                                bool IfPredicateInstr,
2782                                                VPTransformState &State) {
2783   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2784 
2785   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2786   // the first lane and part.
2787   if (isa<NoAliasScopeDeclInst>(Instr))
2788     if (!Instance.isFirstIteration())
2789       return;
2790 
2791   setDebugLocFromInst(Instr);
2792 
  // Does this instruction return a value?
2794   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2795 
2796   Instruction *Cloned = Instr->clone();
2797   if (!IsVoidRetTy)
2798     Cloned->setName(Instr->getName() + ".cloned");
2799 
  // If the scalarized instruction contributes to the address computation of a
  // widened masked load/store which was in a basic block that needed
  // predication and is not predicated after vectorization, we can't propagate
  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
  // instruction could feed a poison value to the base address of the widened
  // load/store.
2806   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2807     Cloned->dropPoisonGeneratingFlags();
2808 
2809   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
2810                                Builder.GetInsertPoint());
2811   // Replace the operands of the cloned instructions with their scalar
2812   // equivalents in the new loop.
2813   for (auto &I : enumerate(RepRecipe->operands())) {
2814     auto InputInstance = Instance;
2815     VPValue *Operand = I.value();
2816     VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
2817     if (OperandR && OperandR->isUniform())
2818       InputInstance.Lane = VPLane::getFirstLane();
2819     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2820   }
2821   addNewMetadata(Cloned, Instr);
2822 
2823   // Place the cloned scalar in the new loop.
2824   Builder.Insert(Cloned);
2825 
2826   State.set(RepRecipe, Cloned, Instance);
2827 
  // If we just cloned a new assumption, add it to the assumption cache.
2829   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2830     AC->registerAssumption(II);
2831 
2832   // End if-block.
2833   if (IfPredicateInstr)
2834     PredicatedInstructions.push_back(Cloned);
2835 }
2836 
2837 void InnerLoopVectorizer::createHeaderBranch(Loop *L) {
2838   BasicBlock *Header = L->getHeader();
2839   assert(!L->getLoopLatch() && "loop should not have a latch at this point");
2840 
2841   IRBuilder<> B(Header->getTerminator());
2842   Instruction *OldInst =
2843       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
2844   setDebugLocFromInst(OldInst, &B);
2845 
2846   // Connect the header to the exit and header blocks and replace the old
2847   // terminator.
2848   B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header);
2849 
2850   // Now we have two terminators. Remove the old one from the block.
2851   Header->getTerminator()->eraseFromParent();
2852 }
2853 
2854 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2855   if (TripCount)
2856     return TripCount;
2857 
2858   assert(InsertBlock);
2859   IRBuilder<> Builder(InsertBlock->getTerminator());
2860   // Find the loop boundaries.
2861   ScalarEvolution *SE = PSE.getSE();
2862   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2863   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2864          "Invalid loop count");
2865 
2866   Type *IdxTy = Legal->getWidestInductionType();
2867   assert(IdxTy && "No type for induction");
2868 
  // The exit count might have type i64 while the phi has type i32. This can
  // happen if we have an induction variable that is sign-extended before the
  // compare. The only way we get a backedge-taken count in that case is if
  // the induction variable was signed, and as such it will not overflow. In
  // such a case truncation is legal.
2874   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2875       IdxTy->getPrimitiveSizeInBits())
2876     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2877   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2878 
2879   // Get the total trip count from the count by adding 1.
2880   const SCEV *ExitCount = SE->getAddExpr(
2881       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2882 
2883   const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2884 
2885   // Expand the trip count and place the new instructions in the preheader.
2886   // Notice that the pre-header does not change, only the loop body.
2887   SCEVExpander Exp(*SE, DL, "induction");
2888 
2889   // Count holds the overall loop count (N).
2890   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2891                                 InsertBlock->getTerminator());
2892 
2893   if (TripCount->getType()->isPointerTy())
2894     TripCount =
2895         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2896                                     InsertBlock->getTerminator());
2897 
2898   return TripCount;
2899 }
2900 
2901 Value *
2902 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2903   if (VectorTripCount)
2904     return VectorTripCount;
2905 
2906   Value *TC = getOrCreateTripCount(InsertBlock);
2907   IRBuilder<> Builder(InsertBlock->getTerminator());
2908 
2909   Type *Ty = TC->getType();
2910   // This is where we can make the step a runtime constant.
2911   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2912 
2913   // If the tail is to be folded by masking, round the number of iterations N
2914   // up to a multiple of Step instead of rounding down. This is done by first
2915   // adding Step-1 and then rounding down. Note that it's ok if this addition
2916   // overflows: the vector induction variable will eventually wrap to zero given
2917   // that it starts at zero and its Step is a power of two; the loop will then
2918   // exit, with the last early-exit vector comparison also producing all-true.
2919   if (Cost->foldTailByMasking()) {
2920     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2921            "VF*UF must be a power of 2 when folding tail by masking");
2922     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2923     TC = Builder.CreateAdd(
2924         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2925   }
2926 
2927   // Now we need to generate the expression for the part of the loop that the
2928   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2929   // iterations are not required for correctness, or N - Step, otherwise. Step
2930   // is equal to the vectorization factor (number of SIMD elements) times the
2931   // unroll factor (number of SIMD instructions).
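  // For example, with no tail folding, a trip count N = 21, VF = 4 and UF = 2
  // (Step = 8) gives N % Step = 5, so the vector loop covers the first 16
  // iterations and the remaining 5 run in the scalar loop.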
2932   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2933 
2934   // There are cases where we *must* run at least one iteration in the remainder
2935   // loop.  See the cost model for when this can happen.  If the step evenly
2936   // divides the trip count, we set the remainder to be equal to the step. If
2937   // the step does not evenly divide the trip count, no adjustment is necessary
2938   // since there will already be scalar iterations. Note that the minimum
2939   // iterations check ensures that N >= Step.
2940   if (Cost->requiresScalarEpilogue(VF)) {
2941     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2942     R = Builder.CreateSelect(IsZero, Step, R);
2943   }
2944 
2945   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2946 
2947   return VectorTripCount;
2948 }
2949 
2950 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2951                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as
  // DstVTy.
2953   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2954   unsigned VF = DstFVTy->getNumElements();
2955   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2956   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2957   Type *SrcElemTy = SrcVecTy->getElementType();
2958   Type *DstElemTy = DstFVTy->getElementType();
2959   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2960          "Vector elements must have same size");
2961 
2962   // Do a direct cast if element types are castable.
2963   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2964     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2965   }
  // V cannot be directly cast to the desired vector type. This may happen
  // when V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step cast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
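  // E.g. on a target with 64-bit pointers (illustrative):
  //   <4 x double> --> <4 x i64> --> <4 x i8*>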
2970   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2971          "Only one type should be a pointer type");
2972   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2973          "Only one type should be a floating point type");
2974   Type *IntTy =
2975       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2976   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2977   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2978   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2979 }
2980 
2981 void InnerLoopVectorizer::emitMinimumIterationCountCheck(BasicBlock *Bypass) {
2982   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
  // Reuse the existing vector loop preheader for the TC checks.
  // Note that a new preheader block is generated for the vector loop.
2985   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2986   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2987 
2988   // Generate code to check if the loop's trip count is less than VF * UF, or
2989   // equal to it in case a scalar epilogue is required; this implies that the
2990   // vector trip count is zero. This check also covers the case where adding one
2991   // to the backedge-taken count overflowed leading to an incorrect trip count
2992   // of zero. In this case we will also jump to the scalar loop.
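  // For example, with a fixed VF = 4 and UF = 2 this emits roughly
  // (illustrative):
  //   %min.iters.check = icmp ult i64 %trip.count, 8
  // and branches to the scalar loop when the check is true.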
2993   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2994                                             : ICmpInst::ICMP_ULT;
2995 
2996   // If tail is to be folded, vector loop takes care of all iterations.
2997   Value *CheckMinIters = Builder.getFalse();
2998   if (!Cost->foldTailByMasking()) {
2999     Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
3000     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3001   }
3002   // Create new preheader for vector loop.
3003   LoopVectorPreHeader =
3004       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3005                  "vector.ph");
3006 
3007   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3008                                DT->getNode(Bypass)->getIDom()) &&
3009          "TC check is expected to dominate Bypass");
3010 
3011   // Update dominator for Bypass & LoopExit (if needed).
3012   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3013   if (!Cost->requiresScalarEpilogue(VF))
3014     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3016     // dominator of the exit blocks.
3017     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3018 
3019   ReplaceInstWithInst(
3020       TCCheckBlock->getTerminator(),
3021       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3022   LoopBypassBlocks.push_back(TCCheckBlock);
3023 }
3024 
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3027   BasicBlock *const SCEVCheckBlock =
3028       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3029   if (!SCEVCheckBlock)
3030     return nullptr;
3031 
3032   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3033            (OptForSizeBasedOnProfile &&
3034             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3035          "Cannot SCEV check stride or overflow when optimizing for size");
3036 
3037 
  // Update dominator only if this is the first RT check.
3039   if (LoopBypassBlocks.empty()) {
3040     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3041     if (!Cost->requiresScalarEpilogue(VF))
3042       // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
3044       // dominator of the exit blocks.
3045       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3046   }
3047 
3048   LoopBypassBlocks.push_back(SCEVCheckBlock);
3049   AddedSafetyChecks = true;
3050   return SCEVCheckBlock;
3051 }
3052 
3053 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3054   // VPlan-native path does not do any analysis for runtime checks currently.
3055   if (EnableVPlanNativePath)
3056     return nullptr;
3057 
3058   BasicBlock *const MemCheckBlock =
3059       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3060 
  // Check if we generated code that checks at runtime whether arrays overlap.
  // We put the checks into a separate block to make the more common case of
  // few elements faster.
3064   if (!MemCheckBlock)
3065     return nullptr;
3066 
3067   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3068     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3069            "Cannot emit memory checks when optimizing for size, unless forced "
3070            "to vectorize.");
3071     ORE->emit([&]() {
3072       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3073                                         OrigLoop->getStartLoc(),
3074                                         OrigLoop->getHeader())
3075              << "Code-size may be reduced by not forcing "
3076                 "vectorization, or by source-code modifications "
3077                 "eliminating the need for runtime checks "
3078                 "(e.g., adding 'restrict').";
3079     });
3080   }
3081 
3082   LoopBypassBlocks.push_back(MemCheckBlock);
3083 
3084   AddedSafetyChecks = true;
3085 
3086   // We currently don't use LoopVersioning for the actual loop cloning but we
3087   // still use it to add the noalias metadata.
3088   LVer = std::make_unique<LoopVersioning>(
3089       *Legal->getLAI(),
3090       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3091       DT, PSE.getSE());
3092   LVer->prepareNoAliasMetadata();
3093   return MemCheckBlock;
3094 }
3095 
3096 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3097   LoopScalarBody = OrigLoop->getHeader();
3098   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3099   assert(LoopVectorPreHeader && "Invalid loop structure");
3100   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3101   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3102          "multiple exit loop without required epilogue?");
3103 
3104   LoopMiddleBlock =
3105       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3106                  LI, nullptr, Twine(Prefix) + "middle.block");
3107   LoopScalarPreHeader =
3108       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3109                  nullptr, Twine(Prefix) + "scalar.ph");
3110 
3111   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3112 
3113   // Set up the middle block terminator.  Two cases:
3114   // 1) If we know that we must execute the scalar epilogue, emit an
3115   //    unconditional branch.
3116   // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case).  In this case, set up a conditional
3118   //    branch from the middle block to the loop scalar preheader, and the
3119   //    exit block.  completeLoopSkeleton will update the condition to use an
3120   //    iteration check, if required to decide whether to execute the remainder.
3121   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3122     BranchInst::Create(LoopScalarPreHeader) :
3123     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3124                        Builder.getTrue());
3125   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3126   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3127 
  // We intentionally don't let SplitBlock update LoopInfo since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct place a few lines later.
3131   BasicBlock *LoopVectorBody =
3132       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3133                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3134 
3135   // Update dominator for loop exit.
3136   if (!Cost->requiresScalarEpilogue(VF))
3137     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3139     // dominator of the exit blocks.
3140     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3141 
3142   // Create and register the new vector loop.
3143   Loop *Lp = LI->AllocateLoop();
3144   Loop *ParentLoop = OrigLoop->getParentLoop();
3145 
3146   // Insert the new loop into the loop nest and register the new basic blocks
3147   // before calling any utilities such as SCEV that require valid LoopInfo.
3148   if (ParentLoop) {
3149     ParentLoop->addChildLoop(Lp);
3150   } else {
3151     LI->addTopLevelLoop(Lp);
3152   }
3153   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3154   return Lp;
3155 }
3156 
3157 void InnerLoopVectorizer::createInductionResumeValues(
3158     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3159   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3160           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3161          "Inconsistent information about additional bypass.");
3162 
3163   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3164   assert(VectorTripCount && "Expected valid arguments");
3165   // We are going to resume the execution of the scalar loop.
3166   // Go over all of the induction variables that we found and fix the
3167   // PHIs that are left in the scalar version of the loop.
3168   // The starting values of PHI nodes depend on the counter of the last
3169   // iteration in the vectorized loop.
3170   // If we come from a bypass edge then we need to start from the original
3171   // start value.
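  // The resulting merge looks roughly like (illustrative):
  //   %bc.resume.val = phi [ %ind.end, %middle.block ],
  //                        [ %start.value, %bypass.block ]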
3172   Instruction *OldInduction = Legal->getPrimaryInduction();
3173   for (auto &InductionEntry : Legal->getInductionVars()) {
3174     PHINode *OrigPhi = InductionEntry.first;
3175     InductionDescriptor II = InductionEntry.second;
3176 
    // Create phi nodes to merge from the backedge-taken check block.
3178     PHINode *BCResumeVal =
3179         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3180                         LoopScalarPreHeader->getTerminator());
3181     // Copy original phi DL over to the new one.
3182     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3183     Value *&EndValue = IVEndValues[OrigPhi];
3184     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3185     if (OrigPhi == OldInduction) {
3186       // We know what the end value is.
3187       EndValue = VectorTripCount;
3188     } else {
3189       IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3190 
3191       // Fast-math-flags propagate from the original induction instruction.
3192       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3193         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3194 
3195       Type *StepType = II.getStep()->getType();
3196       Instruction::CastOps CastOp =
3197           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3198       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3199       Value *Step =
3200           CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3201       EndValue = emitTransformedIndex(B, CRD, II.getStartValue(), Step, II);
3202       EndValue->setName("ind.end");
3203 
3204       // Compute the end value for the additional bypass (if applicable).
3205       if (AdditionalBypass.first) {
3206         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3207         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3208                                          StepType, true);
3209         Value *Step =
3210             CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3211         CRD =
3212             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3213         EndValueFromAdditionalBypass =
3214             emitTransformedIndex(B, CRD, II.getStartValue(), Step, II);
3215         EndValueFromAdditionalBypass->setName("ind.end");
3216       }
3217     }
3218     // The new PHI merges the original incoming value, in case of a bypass,
3219     // or the value at the end of the vectorized loop.
3220     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3221 
3222     // Fix the scalar body counter (PHI node).
3223     // The old induction's phi node in the scalar body needs the truncated
3224     // value.
3225     for (BasicBlock *BB : LoopBypassBlocks)
3226       BCResumeVal->addIncoming(II.getStartValue(), BB);
3227 
3228     if (AdditionalBypass.first)
3229       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3230                                             EndValueFromAdditionalBypass);
3231 
3232     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3233   }
3234 }
3235 
3236 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) {
3237   // The trip counts should be cached by now.
3238   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3239   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3240 
3241   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3242 
3243   // Add a check in the middle block to see if we have completed
3244   // all of the iterations in the first vector loop.  Three cases:
3245   // 1) If we require a scalar epilogue, there is no conditional branch as
3246   //    we unconditionally branch to the scalar preheader.  Do nothing.
3247   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3248   //    Thus if tail is to be folded, we know we don't need to run the
3249   //    remainder and we can use the previous value for the condition (true).
3250   // 3) Otherwise, construct a runtime check.
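  // In case (3), the emitted check has roughly the form (illustrative):
  //   %cmp.n = icmp eq i64 %trip.count, %n.vec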
3251   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3252     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3253                                         Count, VectorTripCount, "cmp.n",
3254                                         LoopMiddleBlock->getTerminator());
3255 
3256     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3257     // of the corresponding compare because they may have ended up with
3258     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g. if the compare has a line number inside the loop.
3260     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3261     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3262   }
3263 
3264 #ifdef EXPENSIVE_CHECKS
3265   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3266   LI->verify(*DT);
3267 #endif
3268 
3269   return LoopVectorPreHeader;
3270 }
3271 
3272 std::pair<BasicBlock *, Value *>
3273 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3274   /*
3275    In this function we generate a new loop. The new loop will contain
3276    the vectorized instructions while the old loop will continue to run the
3277    scalar remainder.
3278 
3279        [ ] <-- loop iteration number check.
3280     /   |
3281    /    v
3282   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3283   |  /  |
3284   | /   v
3285   ||   [ ]     <-- vector pre header.
3286   |/    |
3287   |     v
3288   |    [  ] \
3289   |    [  ]_|   <-- vector loop.
3290   |     |
3291   |     v
3292   \   -[ ]   <--- middle-block.
3293    \/   |
3294    /\   v
3295    | ->[ ]     <--- new preheader.
3296    |    |
3297  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3298    |   [ ] \
3299    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3300     \   |
3301      \  v
3302       >[ ]     <-- exit block(s).
3303    ...
3304    */
3305 
3306   // Get the metadata of the original loop before it gets modified.
3307   MDNode *OrigLoopID = OrigLoop->getLoopID();
3308 
3309   // Workaround!  Compute the trip count of the original loop and cache it
3310   // before we start modifying the CFG.  This code has a systemic problem
3311   // wherein it tries to run analysis over partially constructed IR; this is
3312   // wrong, and not simply for SCEV.  The trip count of the original loop
3313   // simply happens to be prone to hitting this in practice.  In theory, we
3314   // can hit the same issue for any SCEV, or ValueTracking query done during
3315   // mutation.  See PR49900.
3316   getOrCreateTripCount(OrigLoop->getLoopPreheader());
3317 
3318   // Create an empty vector loop, and prepare basic blocks for the runtime
3319   // checks.
3320   Loop *Lp = createVectorLoopSkeleton("");
3321 
3322   // Now, compare the new count to zero. If it is zero skip the vector loop and
3323   // jump to the scalar loop. This check also covers the case where the
3324   // backedge-taken count is uint##_max: adding one to it will overflow leading
3325   // to an incorrect trip count of zero. In this (rare) case we will also jump
3326   // to the scalar loop.
3327   emitMinimumIterationCountCheck(LoopScalarPreHeader);
3328 
3329   // Generate the code to check any assumptions that we've made for SCEV
3330   // expressions.
3331   emitSCEVChecks(LoopScalarPreHeader);
3332 
3333   // Generate the code that checks in runtime if arrays overlap. We put the
3334   // checks into a separate block to make the more common case of few elements
3335   // faster.
3336   emitMemRuntimeChecks(LoopScalarPreHeader);
3337 
3338   createHeaderBranch(Lp);
3339 
3340   // Emit phis for the new starting index of the scalar loop.
3341   createInductionResumeValues();
3342 
3343   return {completeLoopSkeleton(OrigLoopID), nullptr};
3344 }
3345 
3346 // Fix up external users of the induction variable. At this point, we are
3347 // in LCSSA form, with all external PHIs that use the IV having one input value,
3348 // coming from the remainder loop. We need those PHIs to also have a correct
3349 // value for the IV when arriving directly from the middle block.
3350 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3351                                        const InductionDescriptor &II,
3352                                        Value *CountRoundDown, Value *EndValue,
3353                                        BasicBlock *MiddleBlock,
3354                                        BasicBlock *VectorHeader) {
3355   // There are two kinds of external IV usages - those that use the value
3356   // computed in the last iteration (the PHI) and those that use the penultimate
3357   // value (the value that feeds into the phi from the loop latch).
3358   // We allow both, but they, obviously, have different values.
3359 
3360   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3361 
3362   DenseMap<Value *, Value *> MissingVals;
3363 
3364   // An external user of the last iteration's value should see the value that
3365   // the remainder loop uses to initialize its own IV.
3366   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3367   for (User *U : PostInc->users()) {
3368     Instruction *UI = cast<Instruction>(U);
3369     if (!OrigLoop->contains(UI)) {
3370       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3371       MissingVals[UI] = EndValue;
3372     }
3373   }
3374 
  // An external user of the penultimate value needs to see EndValue - Step.
3376   // The simplest way to get this is to recompute it from the constituent SCEVs,
3377   // that is Start + (Step * (CRD - 1)).
3378   for (User *U : OrigPhi->users()) {
3379     auto *UI = cast<Instruction>(U);
3380     if (!OrigLoop->contains(UI)) {
3381       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3382 
3383       IRBuilder<> B(MiddleBlock->getTerminator());
3384 
3385       // Fast-math-flags propagate from the original induction instruction.
3386       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3387         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3388 
3389       Value *CountMinusOne = B.CreateSub(
3390           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3391       Value *CMO =
3392           !II.getStep()->getType()->isIntegerTy()
3393               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3394                              II.getStep()->getType())
3395               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3396       CMO->setName("cast.cmo");
3397 
3398       Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3399                                     VectorHeader->getTerminator());
3400       Value *Escape =
3401           emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
3402       Escape->setName("ind.escape");
3403       MissingVals[UI] = Escape;
3404     }
3405   }
3406 
3407   for (auto &I : MissingVals) {
3408     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3410     // that is %IV2 = phi [...], [ %IV1, %latch ]
3411     // In this case, if IV1 has an external use, we need to avoid adding both
3412     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3413     // don't already have an incoming value for the middle block.
3414     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3415       PHI->addIncoming(I.second, MiddleBlock);
3416   }
3417 }
3418 
3419 namespace {
3420 
3421 struct CSEDenseMapInfo {
3422   static bool canHandle(const Instruction *I) {
3423     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3424            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3425   }
3426 
3427   static inline Instruction *getEmptyKey() {
3428     return DenseMapInfo<Instruction *>::getEmptyKey();
3429   }
3430 
3431   static inline Instruction *getTombstoneKey() {
3432     return DenseMapInfo<Instruction *>::getTombstoneKey();
3433   }
3434 
3435   static unsigned getHashValue(const Instruction *I) {
3436     assert(canHandle(I) && "Unknown instruction!");
3437     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3438                                                            I->value_op_end()));
3439   }
3440 
3441   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3442     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3443         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3444       return LHS == RHS;
3445     return LHS->isIdenticalTo(RHS);
3446   }
3447 };
3448 
3449 } // end anonymous namespace
3450 
/// Perform CSE of induction-variable instructions.
3452 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3454   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3455   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3456     if (!CSEDenseMapInfo::canHandle(&In))
3457       continue;
3458 
3459     // Check if we can replace this instruction with any of the
3460     // visited instructions.
3461     if (Instruction *V = CSEMap.lookup(&In)) {
3462       In.replaceAllUsesWith(V);
3463       In.eraseFromParent();
3464       continue;
3465     }
3466 
3467     CSEMap[&In] = &In;
3468   }
3469 }
3470 
3471 InstructionCost
3472 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3473                                               bool &NeedToScalarize) const {
3474   Function *F = CI->getCalledFunction();
3475   Type *ScalarRetTy = CI->getType();
3476   SmallVector<Type *, 4> Tys, ScalarTys;
3477   for (auto &ArgOp : CI->args())
3478     ScalarTys.push_back(ArgOp->getType());
3479 
3480   // Estimate cost of scalarized vector call. The source operands are assumed
3481   // to be vectors, so we need to extract individual elements from there,
3482   // execute VF scalar calls, and then gather the result into the vector return
3483   // value.
3484   InstructionCost ScalarCallCost =
3485       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3486   if (VF.isScalar())
3487     return ScalarCallCost;
3488 
3489   // Compute corresponding vector type for return value and arguments.
3490   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3491   for (Type *ScalarTy : ScalarTys)
3492     Tys.push_back(ToVectorTy(ScalarTy, VF));
3493 
3494   // Compute costs of unpacking argument values for the scalar calls and
3495   // packing the return values to a vector.
3496   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3497 
3498   InstructionCost Cost =
3499       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3500 
3501   // If we can't emit a vector call for this function, then the currently found
3502   // cost is the cost we need to return.
3503   NeedToScalarize = true;
3504   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3505   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3506 
3507   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3508     return Cost;
3509 
3510   // If the corresponding vector cost is cheaper, return its cost.
3511   InstructionCost VectorCallCost =
3512       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3513   if (VectorCallCost < Cost) {
3514     NeedToScalarize = false;
3515     Cost = VectorCallCost;
3516   }
3517   return Cost;
3518 }
3519 
3520 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3521   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3522     return Elt;
3523   return VectorType::get(Elt, VF);
3524 }
3525 
3526 InstructionCost
3527 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3528                                                    ElementCount VF) const {
3529   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3530   assert(ID && "Expected intrinsic call!");
3531   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3532   FastMathFlags FMF;
3533   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3534     FMF = FPMO->getFastMathFlags();
3535 
3536   SmallVector<const Value *> Arguments(CI->args());
3537   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3538   SmallVector<Type *> ParamTys;
3539   std::transform(FTy->param_begin(), FTy->param_end(),
3540                  std::back_inserter(ParamTys),
3541                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3542 
3543   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3544                                     dyn_cast<IntrinsicInst>(CI));
3545   return TTI.getIntrinsicInstrCost(CostAttrs,
3546                                    TargetTransformInfo::TCK_RecipThroughput);
3547 }
3548 
3549 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3550   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3551   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3552   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3553 }
3554 
3555 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3556   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3557   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3558   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3559 }
3560 
3561 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I`, and re-extend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
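  // For example, an i32 add whose result is known to need only 8 bits is
  // rewritten roughly as (illustrative):
  //   %a8 = trunc <4 x i32> %a to <4 x i8>
  //   %b8 = trunc <4 x i32> %b to <4 x i8>
  //   %r8 = add <4 x i8> %a8, %b8
  //   %r  = zext <4 x i8> %r8 to <4 x i32>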
3565   SmallPtrSet<Value *, 4> Erased;
3566   for (const auto &KV : Cost->getMinimalBitwidths()) {
3567     // If the value wasn't vectorized, we must maintain the original scalar
3568     // type. The absence of the value from State indicates that it
3569     // wasn't vectorized.
3570     // FIXME: Should not rely on getVPValue at this point.
3571     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3572     if (!State.hasAnyVectorValue(Def))
3573       continue;
3574     for (unsigned Part = 0; Part < UF; ++Part) {
3575       Value *I = State.get(Def, Part);
3576       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3577         continue;
3578       Type *OriginalTy = I->getType();
3579       Type *ScalarTruncatedTy =
3580           IntegerType::get(OriginalTy->getContext(), KV.second);
3581       auto *TruncatedTy = VectorType::get(
3582           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3583       if (TruncatedTy == OriginalTy)
3584         continue;
3585 
3586       IRBuilder<> B(cast<Instruction>(I));
3587       auto ShrinkOperand = [&](Value *V) -> Value * {
3588         if (auto *ZI = dyn_cast<ZExtInst>(V))
3589           if (ZI->getSrcTy() == TruncatedTy)
3590             return ZI->getOperand(0);
3591         return B.CreateZExtOrTrunc(V, TruncatedTy);
3592       };
3593 
3594       // The actual instruction modification depends on the instruction type,
3595       // unfortunately.
3596       Value *NewI = nullptr;
3597       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3598         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3599                              ShrinkOperand(BO->getOperand(1)));
3600 
3601         // Any wrapping introduced by shrinking this operation shouldn't be
3602         // considered undefined behavior. So, we can't unconditionally copy
3603         // arithmetic wrapping flags to NewI.
3604         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3605       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3606         NewI =
3607             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3608                          ShrinkOperand(CI->getOperand(1)));
3609       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3610         NewI = B.CreateSelect(SI->getCondition(),
3611                               ShrinkOperand(SI->getTrueValue()),
3612                               ShrinkOperand(SI->getFalseValue()));
3613       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3614         switch (CI->getOpcode()) {
3615         default:
3616           llvm_unreachable("Unhandled cast!");
3617         case Instruction::Trunc:
3618           NewI = ShrinkOperand(CI->getOperand(0));
3619           break;
3620         case Instruction::SExt:
3621           NewI = B.CreateSExtOrTrunc(
3622               CI->getOperand(0),
3623               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3624           break;
3625         case Instruction::ZExt:
3626           NewI = B.CreateZExtOrTrunc(
3627               CI->getOperand(0),
3628               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3629           break;
3630         }
3631       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3632         auto Elements0 =
3633             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3634         auto *O0 = B.CreateZExtOrTrunc(
3635             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3636         auto Elements1 =
3637             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3638         auto *O1 = B.CreateZExtOrTrunc(
3639             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3640 
3641         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3642       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3643         // Don't do anything with the operands, just extend the result.
3644         continue;
3645       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3646         auto Elements =
3647             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3648         auto *O0 = B.CreateZExtOrTrunc(
3649             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3650         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3651         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3652       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3653         auto Elements =
3654             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3655         auto *O0 = B.CreateZExtOrTrunc(
3656             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3657         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3658       } else {
3659         // If we don't know what to do, be conservative and don't do anything.
3660         continue;
3661       }
3662 
3663       // Lastly, extend the result.
3664       NewI->takeName(cast<Instruction>(I));
3665       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3666       I->replaceAllUsesWith(Res);
3667       cast<Instruction>(I)->eraseFromParent();
3668       Erased.insert(I);
3669       State.reset(Def, Res, Part);
3670     }
3671   }
3672 
  // We'll have created a bunch of ZExts that are now dead. Clean them up.
3674   for (const auto &KV : Cost->getMinimalBitwidths()) {
3675     // If the value wasn't vectorized, we must maintain the original scalar
3676     // type. The absence of the value from State indicates that it
3677     // wasn't vectorized.
3678     // FIXME: Should not rely on getVPValue at this point.
3679     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3680     if (!State.hasAnyVectorValue(Def))
3681       continue;
3682     for (unsigned Part = 0; Part < UF; ++Part) {
3683       Value *I = State.get(Def, Part);
3684       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3685       if (Inst && Inst->use_empty()) {
3686         Value *NewI = Inst->getOperand(0);
3687         Inst->eraseFromParent();
3688         State.reset(Def, NewI, Part);
3689       }
3690     }
3691   }
3692 }
3693 
3694 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
3695   // Insert truncates and extends for any truncated instructions as hints to
3696   // InstCombine.
3697   if (VF.isVector())
3698     truncateToMinimalBitwidths(State);
3699 
3700   // Fix widened non-induction PHIs by setting up the PHI operands.
3701   if (OrigPHIsToFix.size()) {
3702     assert(EnableVPlanNativePath &&
3703            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3704     fixNonInductionPHIs(State);
3705   }
3706 
3707   // At this point every instruction in the original loop is widened to a
3708   // vector form. Now we need to fix the recurrences in the loop. These PHI
3709   // nodes are currently empty because we did not want to introduce cycles.
3710   // This is the second stage of vectorizing recurrences.
3711   fixCrossIterationPHIs(State);
3712 
3713   // Forget the original basic block.
3714   PSE.getSE()->forgetLoop(OrigLoop);
3715 
3716   Loop *VectorLoop = LI->getLoopFor(State.CFG.PrevBB);
3717   // If we inserted an edge from the middle block to the unique exit block,
3718   // update uses outside the loop (phis) to account for the newly inserted
3719   // edge.
3720   if (!Cost->requiresScalarEpilogue(VF)) {
3721     // Fix-up external users of the induction variables.
3722     for (auto &Entry : Legal->getInductionVars())
3723       fixupIVUsers(Entry.first, Entry.second,
3724                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3725                    IVEndValues[Entry.first], LoopMiddleBlock,
3726                    VectorLoop->getHeader());
3727 
3728     fixLCSSAPHIs(State);
3729   }
3730 
3731   for (Instruction *PI : PredicatedInstructions)
3732     sinkScalarOperands(&*PI);
3733 
3734   // Remove redundant induction instructions.
3735   cse(VectorLoop->getHeader());
3736 
3737   // Set/update profile weights for the vector and remainder loops as original
3738   // loop iterations are now distributed among them. Note that original loop
3739   // represented by LoopScalarBody becomes remainder loop after vectorization.
3740   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
3746   //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
3750   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3751                                LI->getLoopFor(LoopScalarBody),
3752                                VF.getKnownMinValue() * UF);
3753 }
3754 
3755 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3756   // In order to support recurrences we need to be able to vectorize Phi nodes.
3757   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3758   // stage #2: We now need to fix the recurrences by adding incoming edges to
3759   // the currently empty PHI nodes. At this point every instruction in the
3760   // original loop is widened to a vector form so we can use them to construct
3761   // the incoming edges.
3762   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
3763   for (VPRecipeBase &R : Header->phis()) {
3764     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3765       fixReduction(ReductionPhi, State);
3766     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3767       fixFirstOrderRecurrence(FOR, State);
3768   }
3769 }
3770 
3771 void InnerLoopVectorizer::fixFirstOrderRecurrence(
3772     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3773   // This is the second phase of vectorizing first-order recurrences. An
3774   // overview of the transformation is described below. Suppose we have the
3775   // following loop.
3776   //
3777   //   for (int i = 0; i < n; ++i)
3778   //     b[i] = a[i] - a[i - 1];
3779   //
3780   // There is a first-order recurrence on "a". For this loop, the shorthand
3781   // scalar IR looks like:
3782   //
3783   //   scalar.ph:
3784   //     s_init = a[-1]
3785   //     br scalar.body
3786   //
3787   //   scalar.body:
3788   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3789   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3790   //     s2 = a[i]
3791   //     b[i] = s2 - s1
3792   //     br cond, scalar.body, ...
3793   //
  // In this example, s1 is a recurrence because its value depends on the
3795   // previous iteration. In the first phase of vectorization, we created a
3796   // vector phi v1 for s1. We now complete the vectorization and produce the
3797   // shorthand vector IR shown below (for VF = 4, UF = 1).
3798   //
3799   //   vector.ph:
3800   //     v_init = vector(..., ..., ..., a[-1])
3801   //     br vector.body
3802   //
3803   //   vector.body
3804   //     i = phi [0, vector.ph], [i+4, vector.body]
3805   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3806   //     v2 = a[i, i+1, i+2, i+3];
3807   //     v3 = vector(v1(3), v2(0, 1, 2))
3808   //     b[i, i+1, i+2, i+3] = v2 - v3
3809   //     br cond, vector.body, middle.block
3810   //
3811   //   middle.block:
3812   //     x = v2(3)
3813   //     br scalar.ph
3814   //
3815   //   scalar.ph:
3816   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3817   //     br scalar.body
3818   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3821 
3822   // Extract the last vector element in the middle block. This will be the
3823   // initial value for the recurrence when jumping to the scalar loop.
3824   VPValue *PreviousDef = PhiR->getBackedgeValue();
3825   Value *Incoming = State.get(PreviousDef, UF - 1);
3826   auto *ExtractForScalar = Incoming;
3827   auto *IdxTy = Builder.getInt32Ty();
3828   if (VF.isVector()) {
3829     auto *One = ConstantInt::get(IdxTy, 1);
3830     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3831     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3832     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3833     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3834                                                     "vector.recur.extract");
3835   }
3836   // Extract the second last element in the middle block if the
3837   // Phi is used outside the loop. We need to extract the phi itself
3838   // and not the last element (the phi update in the current iteration). This
3839   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3840   // when the scalar loop is not run at all.
3841   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3842   if (VF.isVector()) {
3843     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3844     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3845     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3846         Incoming, Idx, "vector.recur.extract.for.phi");
3847   } else if (UF > 1)
    // When the loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
    // value of `Incoming`. This is analogous to the vectorized case above:
    // extracting the second last element when VF > 1.
3852     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3853 
3854   // Fix the initial value of the original recurrence in the scalar loop.
3855   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3856   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3857   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3858   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3859   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3860     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3861     Start->addIncoming(Incoming, BB);
3862   }
3863 
3864   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3865   Phi->setName("scalar.recur");
3866 
3867   // Finally, fix users of the recurrence outside the loop. The users will need
3868   // either the last value of the scalar recurrence or the last value of the
3869   // vector recurrence we extracted in the middle block. Since the loop is in
3870   // LCSSA form, we just need to find all the phi nodes for the original scalar
3871   // recurrence in the exit block, and then add an edge for the middle block.
3872   // Note that LCSSA does not imply single entry when the original scalar loop
3873   // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from the middle block to
  // the exit block, and thus no phis need to be updated.
3876   if (!Cost->requiresScalarEpilogue(VF))
3877     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3878       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
3879         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3880 }
3881 
3882 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3883                                        VPTransformState &State) {
3884   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
3886   assert(Legal->isReductionVariable(OrigPhi) &&
3887          "Unable to find the reduction variable");
3888   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3889 
3890   RecurKind RK = RdxDesc.getRecurrenceKind();
3891   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3892   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3893   setDebugLocFromInst(ReductionStartValue);
3894 
3895   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3896   // This is the vector-clone of the value that leaves the loop.
3897   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3898 
3899   // Wrap flags are in general invalid after vectorization, clear them.
3900   clearReductionWrapFlags(RdxDesc, State);
3901 
3902   // Before each round, move the insertion point right between
3903   // the PHIs and the values we are going to write.
3904   // This allows us to write both PHINodes and the extractelement
3905   // instructions.
3906   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3907 
3908   setDebugLocFromInst(LoopExitInst);
3909 
3910   Type *PhiTy = OrigPhi->getType();
3911   BasicBlock *VectorLoopLatch =
3912       LI->getLoopFor(State.CFG.PrevBB)->getLoopLatch();
3913   // If tail is folded by masking, the vector value to leave the loop should be
3914   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3915   // instead of the former. For an inloop reduction the reduction will already
3916   // be predicated, and does not need to be handled here.
3917   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3918     for (unsigned Part = 0; Part < UF; ++Part) {
3919       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3920       Value *Sel = nullptr;
3921       for (User *U : VecLoopExitInst->users()) {
3922         if (isa<SelectInst>(U)) {
3923           assert(!Sel && "Reduction exit feeding two selects");
3924           Sel = U;
3925         } else
3926           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3927       }
3928       assert(Sel && "Reduction exit feeds no select");
3929       State.reset(LoopExitInstDef, Sel, Part);
3930 
3931       // If the target can create a predicated operator for the reduction at no
3932       // extra cost in the loop (for example a predicated vadd), it can be
3933       // cheaper for the select to remain in the loop than be sunk out of it,
3934       // and so use the select value for the phi instead of the old
3935       // LoopExitValue.
3936       if (PreferPredicatedReductionSelect ||
3937           TTI->preferPredicatedReductionSelect(
3938               RdxDesc.getOpcode(), PhiTy,
3939               TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi = cast<PHINode>(State.get(PhiR, Part));
3942         VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3943       }
3944     }
3945   }
3946 
3947   // If the vector reduction can be performed in a smaller type, we truncate
3948   // then extend the loop exit value to enable InstCombine to evaluate the
3949   // entire expression in the smaller type.
3950   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3951     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3952     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3953     Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3954     VectorParts RdxParts(UF);
3955     for (unsigned Part = 0; Part < UF; ++Part) {
3956       RdxParts[Part] = State.get(LoopExitInstDef, Part);
3957       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3958       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3959                                         : Builder.CreateZExt(Trunc, VecTy);
3960       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3961         if (U != Trunc) {
3962           U->replaceUsesOfWith(RdxParts[Part], Extnd);
3963           RdxParts[Part] = Extnd;
3964         }
3965     }
3966     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3967     for (unsigned Part = 0; Part < UF; ++Part) {
3968       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3969       State.reset(LoopExitInstDef, RdxParts[Part], Part);
3970     }
3971   }
3972 
3973   // Reduce all of the unrolled parts into a single vector.
3974   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3975   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3976 
3977   // The middle block terminator has already been assigned a DebugLoc here (the
3978   // OrigLoop's single latch terminator). We want the whole middle block to
3979   // appear to execute on this line because: (a) it is all compiler generated,
3980   // (b) these instructions are always executed after evaluating the latch
3981   // conditional branch, and (c) other passes may add new predecessors which
3982   // terminate on this line. This is the easiest way to ensure we don't
3983   // accidentally cause an extra step back into the loop while debugging.
3984   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3985   if (PhiR->isOrdered())
3986     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3987   else {
3988     // Floating-point operations should have some FMF to enable the reduction.
3989     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3990     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3991     for (unsigned Part = 1; Part < UF; ++Part) {
3992       Value *RdxPart = State.get(LoopExitInstDef, Part);
3993       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3994         ReducedPartRdx = Builder.CreateBinOp(
3995             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3996       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3997         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3998                                            ReducedPartRdx, RdxPart);
3999       else
4000         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4001     }
4002   }
4003 
  // Create the reduction after the loop. Note that inloop reductions create
  // the target reduction in the loop using a Reduction recipe.
4006   if (VF.isVector() && !PhiR->isInLoop()) {
4007     ReducedPartRdx =
4008         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4009     // If the reduction can be performed in a smaller type, we need to extend
4010     // the reduction to the wider type before we branch to the original loop.
4011     if (PhiTy != RdxDesc.getRecurrenceType())
4012       ReducedPartRdx = RdxDesc.isSigned()
4013                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4014                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4015   }
4016 
4017   PHINode *ResumePhi =
4018       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4019 
4020   // Create a phi node that merges control-flow from the backedge-taken check
4021   // block and the middle block.
4022   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4023                                         LoopScalarPreHeader->getTerminator());
4024 
4025   // If we are fixing reductions in the epilogue loop then we should already
4026   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4027   // we carry over the incoming values correctly.
4028   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4029     if (Incoming == LoopMiddleBlock)
4030       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4031     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4032       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4033                               Incoming);
4034     else
4035       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4036   }
4037 
4038   // Set the resume value for this reduction
4039   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4040 
4041   // Now, we need to fix the users of the reduction variable
4042   // inside and outside of the scalar remainder loop.
4043 
4044   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4045   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4047   if (!Cost->requiresScalarEpilogue(VF))
4048     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4049       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4050         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4051 
4052   // Fix the scalar loop reduction variable with the incoming reduction sum
4053   // from the vector body and from the backedge value.
4054   int IncomingEdgeBlockIdx =
4055       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4056   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4057   // Pick the other block.
4058   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4059   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4060   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4061 }
4062 
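// Integer add/mul reductions are reassociated by vectorization, so any
// nuw/nsw flags on the participating overflowing binary operators are no
// longer guaranteed to hold; drop them from the widened instructions.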
void InnerLoopVectorizer::clearReductionWrapFlags(
    const RecurrenceDescriptor &RdxDesc, VPTransformState &State) {
4065   RecurKind RK = RdxDesc.getRecurrenceKind();
4066   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4067     return;
4068 
4069   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4070   assert(LoopExitInstr && "null loop exit instruction");
4071   SmallVector<Instruction *, 8> Worklist;
4072   SmallPtrSet<Instruction *, 8> Visited;
4073   Worklist.push_back(LoopExitInstr);
4074   Visited.insert(LoopExitInstr);
4075 
4076   while (!Worklist.empty()) {
4077     Instruction *Cur = Worklist.pop_back_val();
4078     if (isa<OverflowingBinaryOperator>(Cur))
4079       for (unsigned Part = 0; Part < UF; ++Part) {
4080         // FIXME: Should not rely on getVPValue at this point.
4081         Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4082         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4083       }
4084 
4085     for (User *U : Cur->users()) {
4086       Instruction *UI = cast<Instruction>(U);
4087       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4088           Visited.insert(UI).second)
4089         Worklist.push_back(UI);
4090     }
4091   }
4092 }
4093 
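// Add an incoming value from the middle block to each LCSSA phi in the exit
// block that was not already fixed up by the reduction/recurrence code above:
// either the loop-invariant incoming value itself, or the appropriate lane of
// the last unrolled part of the vectorized value.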
4094 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4095   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4096     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
      // Some phis were already hand-updated by the reduction and recurrence
      // code above; leave them alone.
4099       continue;
4100 
    // Non-instruction incoming values will have only one value.
    auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4103 
4104     VPLane Lane = VPLane::getFirstLane();
4105     if (isa<Instruction>(IncomingValue) &&
4106         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4107                                            VF))
4108       Lane = VPLane::getLastLaneForVF(VF);
4109 
4110     // Can be a loop invariant incoming value or the last scalar value to be
4111     // extracted from the vectorized loop.
4112     // FIXME: Should not rely on getVPValue at this point.
4113     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4114     Value *lastIncomingValue =
4115         OrigLoop->isLoopInvariant(IncomingValue)
4116             ? IncomingValue
4117             : State.get(State.Plan->getVPValue(IncomingValue, true),
4118                         VPIteration(UF - 1, Lane));
4119     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4120   }
4121 }
4122 
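// Try to sink scalarized operands of the predicated instruction \p PredInst
// into the predicated block created for it, so that they only execute when
// the predicate holds.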
4123 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4124   // The basic block and loop containing the predicated instruction.
4125   auto *PredBB = PredInst->getParent();
4126   auto *VectorLoop = LI->getLoopFor(PredBB);
4127 
4128   // Initialize a worklist with the operands of the predicated instruction.
4129   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4130 
4131   // Holds instructions that we need to analyze again. An instruction may be
4132   // reanalyzed if we don't yet know if we can sink it or not.
4133   SmallVector<Instruction *, 8> InstsToReanalyze;
4134 
4135   // Returns true if a given use occurs in the predicated block. Phi nodes use
4136   // their operands in their corresponding predecessor blocks.
4137   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4138     auto *I = cast<Instruction>(U.getUser());
4139     BasicBlock *BB = I->getParent();
4140     if (auto *Phi = dyn_cast<PHINode>(I))
4141       BB = Phi->getIncomingBlock(
4142           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4143     return BB == PredBB;
4144   };
4145 
4146   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a pass
  // through the worklist doesn't sink a single instruction.
4150   bool Changed;
4151   do {
4152     // Add the instructions that need to be reanalyzed to the worklist, and
4153     // reset the changed indicator.
4154     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4155     InstsToReanalyze.clear();
4156     Changed = false;
4157 
4158     while (!Worklist.empty()) {
4159       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4160 
4161       // We can't sink an instruction if it is a phi node, is not in the loop,
4162       // or may have side effects.
4163       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4164           I->mayHaveSideEffects())
4165         continue;
4166 
4167       // If the instruction is already in PredBB, check if we can sink its
4168       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4169       // sinking the scalar instruction I, hence it appears in PredBB; but it
4170       // may have failed to sink I's operands (recursively), which we try
4171       // (again) here.
4172       if (I->getParent() == PredBB) {
4173         Worklist.insert(I->op_begin(), I->op_end());
4174         continue;
4175       }
4176 
4177       // It's legal to sink the instruction if all its uses occur in the
4178       // predicated block. Otherwise, there's nothing to do yet, and we may
4179       // need to reanalyze the instruction.
4180       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4181         InstsToReanalyze.push_back(I);
4182         continue;
4183       }
4184 
4185       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4187       I->moveBefore(&*PredBB->getFirstInsertionPt());
4188       Worklist.insert(I->op_begin(), I->op_end());
4189 
4190       // The sinking may have enabled other instructions to be sunk, so we will
4191       // need to iterate.
4192       Changed = true;
4193     }
4194   } while (Changed);
4195 }
4196 
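// Set up the incoming values of the widened non-induction PHIs created in the
// VPlan-native path; their operands were left unset during initial widening
// and are filled in here at the end of vector code generation.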
4197 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4198   for (PHINode *OrigPhi : OrigPHIsToFix) {
4199     VPWidenPHIRecipe *VPPhi =
4200         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4201     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4202     // Make sure the builder has a valid insert point.
4203     Builder.SetInsertPoint(NewPhi);
4204     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4205       VPValue *Inc = VPPhi->getIncomingValue(i);
4206       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4207       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4208     }
4209   }
4210 }
4211 
4212 bool InnerLoopVectorizer::useOrderedReductions(
4213     const RecurrenceDescriptor &RdxDesc) {
4214   return Cost->useOrderedReductions(RdxDesc);
4215 }
4216 
4217 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4218                                               VPWidenPHIRecipe *PhiR,
4219                                               VPTransformState &State) {
4220   PHINode *P = cast<PHINode>(PN);
4221   if (EnableVPlanNativePath) {
4222     // Currently we enter here in the VPlan-native path for non-induction
4223     // PHIs where all control flow is uniform. We simply widen these PHIs.
4224     // Create a vector phi with no operands - the vector phi operands will be
4225     // set at the end of vector code generation.
4226     Type *VecTy = (State.VF.isScalar())
4227                       ? PN->getType()
4228                       : VectorType::get(PN->getType(), State.VF);
4229     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4230     State.set(PhiR, VecPhi, 0);
4231     OrigPHIsToFix.push_back(P);
4232 
4233     return;
4234   }
4235 
4236   assert(PN->getParent() == OrigLoop->getHeader() &&
4237          "Non-header phis should have been handled elsewhere");
4238 
4239   // In order to support recurrences we need to be able to vectorize Phi nodes.
4240   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4241   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4242   // this value when we vectorize all of the instructions that use the PHI.
4243 
4244   assert(!Legal->isReductionVariable(P) &&
4245          "reductions should be handled elsewhere");
4246 
4247   setDebugLocFromInst(P);
4248 
4249   // This PHINode must be an induction variable.
4250   // Make sure that we know about it.
4251   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4252 
4253   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4254   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4255 
4256   auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV();
4257   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
4258 
4259   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4260   // which can be found from the original scalar operations.
4261   switch (II.getKind()) {
4262   case InductionDescriptor::IK_NoInduction:
4263     llvm_unreachable("Unknown induction");
4264   case InductionDescriptor::IK_IntInduction:
4265   case InductionDescriptor::IK_FpInduction:
4266     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4267   case InductionDescriptor::IK_PtrInduction: {
4268     // Handle the pointer induction variable case.
4269     assert(P->getType()->isPointerTy() && "Unexpected type.");
4270 
4271     if (all_of(PhiR->users(), [PhiR](const VPUser *U) {
4272           return cast<VPRecipeBase>(U)->usesScalars(PhiR);
4273         })) {
4274       // This is the normalized GEP that starts counting at zero.
4275       Value *PtrInd =
4276           Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType());
4277       // Determine the number of scalars we need to generate for each unroll
4278       // iteration. If the instruction is uniform, we only need to generate the
4279       // first lane. Otherwise, we generate all VF values.
4280       bool IsUniform = vputils::onlyFirstLaneUsed(PhiR);
4281       assert((IsUniform || !State.VF.isScalable()) &&
4282              "Cannot scalarize a scalable VF");
4283       unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
4284 
4285       for (unsigned Part = 0; Part < UF; ++Part) {
4286         Value *PartStart =
4287             createStepForVF(Builder, PtrInd->getType(), VF, Part);
4288 
4289         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4290           Value *Idx = Builder.CreateAdd(
4291               PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4292           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4293 
4294           Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
4295                                         State.CFG.PrevBB->getTerminator());
4296           Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx,
4297                                                 II.getStartValue(), Step, II);
4298           SclrGep->setName("next.gep");
4299           State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4300         }
4301       }
4302       return;
4303     }
4304     assert(isa<SCEVConstant>(II.getStep()) &&
4305            "Induction step not a SCEV constant!");
4306     Type *PhiType = II.getStep()->getType();
4307 
4308     // Build a pointer phi
4309     Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue();
4310     Type *ScStValueType = ScalarStartValue->getType();
4311     PHINode *NewPointerPhi =
4312         PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
4313     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4314 
4315     // A pointer induction, performed by using a gep
4316     BasicBlock *LoopLatch = LI->getLoopFor(State.CFG.PrevBB)->getLoopLatch();
4317     Instruction *InductionLoc = LoopLatch->getTerminator();
4318     const SCEV *ScalarStep = II.getStep();
4319     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4320     Value *ScalarStepValue =
4321         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4322     Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4323     Value *NumUnrolledElems =
4324         Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4325     Value *InductionGEP = GetElementPtrInst::Create(
4326         II.getElementType(), NewPointerPhi,
4327         Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4328         InductionLoc);
4329     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4330 
4331     // Create UF many actual address geps that use the pointer
4332     // phi as base and a vectorized version of the step value
4333     // (<step*0, ..., step*N>) as offset.
4334     for (unsigned Part = 0; Part < State.UF; ++Part) {
4335       Type *VecPhiType = VectorType::get(PhiType, State.VF);
4336       Value *StartOffsetScalar =
4337           Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4338       Value *StartOffset =
4339           Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
4340       // Create a vector of consecutive numbers from zero to VF.
4341       StartOffset =
4342           Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4343 
4344       Value *GEP = Builder.CreateGEP(
4345           II.getElementType(), NewPointerPhi,
4346           Builder.CreateMul(
4347               StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4348               "vector.gep"));
4349       State.set(PhiR, GEP, Part);
4350     }
4351   }
4352   }
4353 }
4354 
4355 /// A helper function for checking whether an integer division-related
4356 /// instruction may divide by zero (in which case it must be predicated if
4357 /// executed conditionally in the scalar code).
4358 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4359 /// Non-zero divisors that are non compile-time constants will not be
4360 /// converted into multiplication, so we will still end up scalarizing
4361 /// the division, but can do so w/o predication.
4362 static bool mayDivideByZero(Instruction &I) {
4363   assert((I.getOpcode() == Instruction::UDiv ||
4364           I.getOpcode() == Instruction::SDiv ||
4365           I.getOpcode() == Instruction::URem ||
4366           I.getOpcode() == Instruction::SRem) &&
4367          "Unexpected instruction");
4368   Value *Divisor = I.getOperand(1);
4369   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4370   return !CInt || CInt->isZero();
4371 }
4372 
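// Widen the call \p I: for each unrolled part, emit one vector call, using
// either a vector library function or a vector intrinsic, whichever the cost
// model found cheaper, and keeping scalar arguments scalar where the
// intrinsic requires it.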
4373 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4374                                                VPUser &ArgOperands,
4375                                                VPTransformState &State) {
4376   assert(!isa<DbgInfoIntrinsic>(I) &&
4377          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4378   setDebugLocFromInst(&I);
4379 
4380   Module *M = I.getParent()->getParent()->getParent();
4381   auto *CI = cast<CallInst>(&I);
4382 
4383   SmallVector<Type *, 4> Tys;
4384   for (Value *ArgOperand : CI->args())
4385     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4386 
4387   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4388 
  // The flag indicates whether to use an intrinsic or a regular call for the
  // vectorized version of the instruction: is an intrinsic call more
  // beneficial than a library call?
4392   bool NeedToScalarize = false;
4393   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4394   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4395   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4396   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4397          "Instruction should be scalarized elsewhere.");
4398   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4399          "Either the intrinsic cost or vector call cost must be valid");
4400 
4401   for (unsigned Part = 0; Part < UF; ++Part) {
4402     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4403     SmallVector<Value *, 4> Args;
4404     for (auto &I : enumerate(ArgOperands.operands())) {
4405       // Some intrinsics have a scalar argument - don't replace it with a
4406       // vector.
4407       Value *Arg;
4408       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4409         Arg = State.get(I.value(), Part);
4410       else {
4411         Arg = State.get(I.value(), VPIteration(0, 0));
4412         if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
4413           TysForDecl.push_back(Arg->getType());
4414       }
4415       Args.push_back(Arg);
4416     }
4417 
4418     Function *VectorF;
4419     if (UseVectorIntrinsic) {
4420       // Use vector version of the intrinsic.
4421       if (VF.isVector())
4422         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4423       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4424       assert(VectorF && "Can't retrieve vector intrinsic.");
4425     } else {
4426       // Use vector version of the function call.
4427       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4428 #ifndef NDEBUG
4429       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4430              "Can't create vector function.");
4431 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    addMetadata(V, &I);
4443   }
4444 }
4445 
4446 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4447   // We should not collect Scalars more than once per VF. Right now, this
4448   // function is called from collectUniformsAndScalars(), which already does
4449   // this check. Collecting Scalars for VF=1 does not make any sense.
4450   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4451          "This function should not be visited twice for the same VF");
4452 
4453   // This avoids any chances of creating a REPLICATE recipe during planning
4454   // since that would result in generation of scalarized code during execution,
4455   // which is not supported for scalable vectors.
4456   if (VF.isScalable()) {
4457     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4458     return;
4459   }
4460 
4461   SmallSetVector<Instruction *, 8> Worklist;
4462 
4463   // These sets are used to seed the analysis with pointers used by memory
4464   // accesses that will remain scalar.
4465   SmallSetVector<Instruction *, 8> ScalarPtrs;
4466   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4467   auto *Latch = TheLoop->getLoopLatch();
4468 
4469   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4470   // The pointer operands of loads and stores will be scalar as long as the
4471   // memory access is not a gather or scatter operation. The value operand of a
4472   // store will remain scalar if the store is scalarized.
4473   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4474     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4475     assert(WideningDecision != CM_Unknown &&
4476            "Widening decision should be ready at this moment");
4477     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4478       if (Ptr == Store->getValueOperand())
4479         return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4482     return WideningDecision != CM_GatherScatter;
4483   };
4484 
4485   // A helper that returns true if the given value is a bitcast or
4486   // getelementptr instruction contained in the loop.
4487   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4488     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4489             isa<GetElementPtrInst>(V)) &&
4490            !TheLoop->isLoopInvariant(V);
4491   };
4492 
4493   // A helper that evaluates a memory access's use of a pointer. If the use will
4494   // be a scalar use and the pointer is only used by memory accesses, we place
4495   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4496   // PossibleNonScalarPtrs.
4497   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4498     // We only care about bitcast and getelementptr instructions contained in
4499     // the loop.
4500     if (!isLoopVaryingBitCastOrGEP(Ptr))
4501       return;
4502 
4503     // If the pointer has already been identified as scalar (e.g., if it was
4504     // also identified as uniform), there's nothing to do.
4505     auto *I = cast<Instruction>(Ptr);
4506     if (Worklist.count(I))
4507       return;
4508 
4509     // If the use of the pointer will be a scalar use, and all users of the
4510     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4511     // place the pointer in PossibleNonScalarPtrs.
4512     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4513           return isa<LoadInst>(U) || isa<StoreInst>(U);
4514         }))
4515       ScalarPtrs.insert(I);
4516     else
4517       PossibleNonScalarPtrs.insert(I);
4518   };
4519 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
4524   //
4525   // (1) Add to the worklist all instructions that have been identified as
4526   // uniform-after-vectorization.
4527   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4528 
4529   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4530   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4532   // scatter operation. The value operand of a store will remain scalar if the
4533   // store is scalarized.
4534   for (auto *BB : TheLoop->blocks())
4535     for (auto &I : *BB) {
4536       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4537         evaluatePtrUse(Load, Load->getPointerOperand());
4538       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4539         evaluatePtrUse(Store, Store->getPointerOperand());
4540         evaluatePtrUse(Store, Store->getValueOperand());
4541       }
4542     }
4543   for (auto *I : ScalarPtrs)
4544     if (!PossibleNonScalarPtrs.count(I)) {
4545       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4546       Worklist.insert(I);
4547     }
4548 
4549   // Insert the forced scalars.
4550   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4551   // induction variable when the PHI user is scalarized.
4552   auto ForcedScalar = ForcedScalars.find(VF);
4553   if (ForcedScalar != ForcedScalars.end())
4554     for (auto *I : ForcedScalar->second)
4555       Worklist.insert(I);
4556 
4557   // Expand the worklist by looking through any bitcasts and getelementptr
4558   // instructions we've already identified as scalar. This is similar to the
4559   // expansion step in collectLoopUniforms(); however, here we're only
4560   // expanding to include additional bitcasts and getelementptr instructions.
4561   unsigned Idx = 0;
4562   while (Idx != Worklist.size()) {
4563     Instruction *Dst = Worklist[Idx++];
4564     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4565       continue;
4566     auto *Src = cast<Instruction>(Dst->getOperand(0));
4567     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4568           auto *J = cast<Instruction>(U);
4569           return !TheLoop->contains(J) || Worklist.count(J) ||
4570                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4571                   isScalarUse(J, Src));
4572         })) {
4573       Worklist.insert(Src);
4574       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4575     }
4576   }
4577 
4578   // An induction variable will remain scalar if all users of the induction
4579   // variable and induction variable update remain scalar.
4580   for (auto &Induction : Legal->getInductionVars()) {
4581     auto *Ind = Induction.first;
4582     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4583 
4584     // If tail-folding is applied, the primary induction variable will be used
4585     // to feed a vector compare.
4586     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4587       continue;
4588 
4589     // Returns true if \p Indvar is a pointer induction that is used directly by
4590     // load/store instruction \p I.
4591     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4592                                               Instruction *I) {
4593       return Induction.second.getKind() ==
4594                  InductionDescriptor::IK_PtrInduction &&
4595              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4596              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4597     };
4598 
4599     // Determine if all users of the induction variable are scalar after
4600     // vectorization.
4601     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4602       auto *I = cast<Instruction>(U);
4603       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4604              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4605     });
4606     if (!ScalarInd)
4607       continue;
4608 
4609     // Determine if all users of the induction variable update instruction are
4610     // scalar after vectorization.
4611     auto ScalarIndUpdate =
4612         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4613           auto *I = cast<Instruction>(U);
4614           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4615                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4616         });
4617     if (!ScalarIndUpdate)
4618       continue;
4619 
4620     // The induction variable and its update instruction will remain scalar.
4621     Worklist.insert(Ind);
4622     Worklist.insert(IndUpdate);
4623     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4624     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4625                       << "\n");
4626   }
4627 
4628   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4629 }
4630 
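// Return true if \p I must be scalarized and predicated when vectorizing
// with \p VF: either a conditional load/store that the target cannot execute
// as a masked or gather/scatter operation, or an integer division/remainder
// whose divisor may be zero.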
4631 bool LoopVectorizationCostModel::isScalarWithPredication(
4632     Instruction *I, ElementCount VF) const {
4633   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4634     return false;
  switch (I->getOpcode()) {
4636   default:
4637     break;
4638   case Instruction::Load:
4639   case Instruction::Store: {
4640     if (!Legal->isMaskRequired(I))
4641       return false;
4642     auto *Ptr = getLoadStorePointerOperand(I);
4643     auto *Ty = getLoadStoreType(I);
4644     Type *VTy = Ty;
4645     if (VF.isVector())
4646       VTy = VectorType::get(Ty, VF);
4647     const Align Alignment = getLoadStoreAlignment(I);
4648     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4649                                 TTI.isLegalMaskedGather(VTy, Alignment))
4650                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4651                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4652   }
4653   case Instruction::UDiv:
4654   case Instruction::SDiv:
4655   case Instruction::SRem:
4656   case Instruction::URem:
4657     return mayDivideByZero(*I);
4658   }
4659   return false;
4660 }
4661 
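// Return true if the interleaved access group containing \p I can be
// vectorized as a wide load/store for \p VF, taking into account whether the
// group would require masking and whether the target supports the necessary
// masked operations.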
4662 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4663     Instruction *I, ElementCount VF) {
4664   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4665   assert(getWideningDecision(I, VF) == CM_Unknown &&
4666          "Decision should not be set yet.");
4667   auto *Group = getInterleavedAccessGroup(I);
4668   assert(Group && "Must have a group.");
4669 
  // If the instruction's allocated size doesn't equal its type size, it
4671   // requires padding and will be scalarized.
4672   auto &DL = I->getModule()->getDataLayout();
4673   auto *ScalarTy = getLoadStoreType(I);
4674   if (hasIrregularType(ScalarTy, DL))
4675     return false;
4676 
4677   // Check if masking is required.
4678   // A Group may need masking for one of two reasons: it resides in a block that
4679   // needs predication, or it was decided to use masking to deal with gaps
4680   // (either a gap at the end of a load-access that may result in a speculative
4681   // load, or any gaps in a store-access).
4682   bool PredicatedAccessRequiresMasking =
4683       blockNeedsPredicationForAnyReason(I->getParent()) &&
4684       Legal->isMaskRequired(I);
4685   bool LoadAccessWithGapsRequiresEpilogMasking =
4686       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4687       !isScalarEpilogueAllowed();
4688   bool StoreAccessWithGapsRequiresMasking =
4689       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4690   if (!PredicatedAccessRequiresMasking &&
4691       !LoadAccessWithGapsRequiresEpilogMasking &&
4692       !StoreAccessWithGapsRequiresMasking)
4693     return true;
4694 
4695   // If masked interleaving is required, we expect that the user/target had
4696   // enabled it, because otherwise it either wouldn't have been created or
4697   // it should have been invalidated by the CostModel.
4698   assert(useMaskedInterleavedAccesses(TTI) &&
4699          "Masked interleave-groups for predicated accesses are not enabled.");
4700 
4701   if (Group->isReverse())
4702     return false;
4703 
4704   auto *Ty = getLoadStoreType(I);
4705   const Align Alignment = getLoadStoreAlignment(I);
4706   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4707                           : TTI.isLegalMaskedStore(Ty, Alignment);
4708 }
4709 
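// Return true if the load/store \p I can be widened into a single
// consecutive vector memory operation for \p VF: the pointer must be
// consecutive, the access must not need predicated scalarization, and its
// type must not require padding.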
4710 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4711     Instruction *I, ElementCount VF) {
4712   // Get and ensure we have a valid memory instruction.
4713   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4714 
4715   auto *Ptr = getLoadStorePointerOperand(I);
4716   auto *ScalarTy = getLoadStoreType(I);
4717 
4718   // In order to be widened, the pointer should be consecutive, first of all.
4719   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4720     return false;
4721 
4722   // If the instruction is a store located in a predicated block, it will be
4723   // scalarized.
4724   if (isScalarWithPredication(I, VF))
4725     return false;
4726 
  // If the instruction's allocated size doesn't equal its type size, it
4728   // requires padding and will be scalarized.
4729   auto &DL = I->getModule()->getDataLayout();
4730   if (hasIrregularType(ScalarTy, DL))
4731     return false;
4732 
4733   return true;
4734 }
4735 
4736 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4737   // We should not collect Uniforms more than once per VF. Right now,
4738   // this function is called from collectUniformsAndScalars(), which
4739   // already does this check. Collecting Uniforms for VF=1 does not make any
4740   // sense.
4741 
4742   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4743          "This function should not be visited twice for the same VF");
4744 
  // Visit the list of Uniforms. Even if no uniform value is found, the entry
  // is created so we won't analyze this VF again; Uniforms.count(VF) will
  // return 1.
4747   Uniforms[VF].clear();
4748 
4749   // We now know that the loop is vectorizable!
4750   // Collect instructions inside the loop that will remain uniform after
4751   // vectorization.
4752 
  // Global values, params and instructions outside of the current loop are
  // out of scope.
4755   auto isOutOfScope = [&](Value *V) -> bool {
4756     Instruction *I = dyn_cast<Instruction>(V);
4757     return (!I || !TheLoop->contains(I));
4758   };
4759 
4760   // Worklist containing uniform instructions demanding lane 0.
4761   SetVector<Instruction *> Worklist;
4762   BasicBlock *Latch = TheLoop->getLoopLatch();
4763 
4764   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4765   // that are scalar with predication must not be considered uniform after
4766   // vectorization, because that would create an erroneous replicating region
4767   // where only a single instance out of VF should be formed.
  // TODO: optimize such rare cases if they turn out to be important; see
  // PR40816.
4769   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4770     if (isOutOfScope(I)) {
4771       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4772                         << *I << "\n");
4773       return;
4774     }
4775     if (isScalarWithPredication(I, VF)) {
4776       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4777                         << *I << "\n");
4778       return;
4779     }
4780     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4781     Worklist.insert(I);
4782   };
4783 
4784   // Start with the conditional branch. If the branch condition is an
4785   // instruction contained in the loop that is only used by the branch, it is
4786   // uniform.
4787   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4788   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4789     addToWorklistIfAllowed(Cmp);
4790 
4791   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4792     InstWidening WideningDecision = getWideningDecision(I, VF);
4793     assert(WideningDecision != CM_Unknown &&
4794            "Widening decision should be ready at this moment");
4795 
4796     // A uniform memory op is itself uniform.  We exclude uniform stores
4797     // here as they demand the last lane, not the first one.
4798     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
4799       assert(WideningDecision == CM_Scalarize);
4800       return true;
4801     }
4802 
4803     return (WideningDecision == CM_Widen ||
4804             WideningDecision == CM_Widen_Reverse ||
4805             WideningDecision == CM_Interleave);
4806   };
4807 
4808 
4809   // Returns true if Ptr is the pointer operand of a memory access instruction
4810   // I, and I is known to not require scalarization.
4811   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4812     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4813   };
4814 
4815   // Holds a list of values which are known to have at least one uniform use.
4816   // Note that there may be other uses which aren't uniform.  A "uniform use"
4817   // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (i.e. this is not
  // the usual meaning of uniform).
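  // For illustration (hypothetical IR), the address of a consecutive load
  //   %gep = getelementptr inbounds i32, i32* %A, i64 %iv
  //   %v   = load i32, i32* %gep
  // has a uniform use: the widened load only needs the lane-0 value of %gep
  // to form its wide pointer, even though %gep itself varies per lane.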
4820   SetVector<Value *> HasUniformUse;
4821 
4822   // Scan the loop for instructions which are either a) known to have only
4823   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4824   for (auto *BB : TheLoop->blocks())
4825     for (auto &I : *BB) {
4826       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4827         switch (II->getIntrinsicID()) {
4828         case Intrinsic::sideeffect:
4829         case Intrinsic::experimental_noalias_scope_decl:
4830         case Intrinsic::assume:
4831         case Intrinsic::lifetime_start:
4832         case Intrinsic::lifetime_end:
4833           if (TheLoop->hasLoopInvariantOperands(&I))
4834             addToWorklistIfAllowed(&I);
4835           break;
4836         default:
4837           break;
4838         }
4839       }
4840 
4841       // ExtractValue instructions must be uniform, because the operands are
4842       // known to be loop-invariant.
4843       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4844         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4845                "Expected aggregate value to be loop invariant");
4846         addToWorklistIfAllowed(EVI);
4847         continue;
4848       }
4849 
4850       // If there's no pointer operand, there's nothing to do.
4851       auto *Ptr = getLoadStorePointerOperand(&I);
4852       if (!Ptr)
4853         continue;
4854 
4855       // A uniform memory op is itself uniform.  We exclude uniform stores
4856       // here as they demand the last lane, not the first one.
4857       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
4858         addToWorklistIfAllowed(&I);
4859 
4860       if (isUniformDecision(&I, VF)) {
4861         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4862         HasUniformUse.insert(Ptr);
4863       }
4864     }
4865 
4866   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4867   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4868   // disallows uses outside the loop as well.
4869   for (auto *V : HasUniformUse) {
4870     if (isOutOfScope(V))
4871       continue;
4872     auto *I = cast<Instruction>(V);
4873     auto UsersAreMemAccesses =
4874       llvm::all_of(I->users(), [&](User *U) -> bool {
4875         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4876       });
4877     if (UsersAreMemAccesses)
4878       addToWorklistIfAllowed(I);
4879   }
4880 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
4884   unsigned idx = 0;
4885   while (idx != Worklist.size()) {
4886     Instruction *I = Worklist[idx++];
4887 
4888     for (auto OV : I->operand_values()) {
4889       // isOutOfScope operands cannot be uniform instructions.
4890       if (isOutOfScope(OV))
4891         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4894       auto *OP = dyn_cast<PHINode>(OV);
4895       if (OP && Legal->isFirstOrderRecurrence(OP))
4896         continue;
4897       // If all the users of the operand are uniform, then add the
4898       // operand into the uniform worklist.
4899       auto *OI = cast<Instruction>(OV);
4900       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4901             auto *J = cast<Instruction>(U);
4902             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4903           }))
4904         addToWorklistIfAllowed(OI);
4905     }
4906   }
4907 
4908   // For an instruction to be added into Worklist above, all its users inside
4909   // the loop should also be in Worklist. However, this condition cannot be
4910   // true for phi nodes that form a cyclic dependence. We must process phi
4911   // nodes separately. An induction variable will remain uniform if all users
4912   // of the induction variable and induction variable update remain uniform.
4913   // The code below handles both pointer and non-pointer induction variables.
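  // For illustration (hypothetical loop): an induction %i whose only uses are
  // the latch compare and the address of a consecutive load remains uniform,
  // together with its update %i.next, because every use demands only lane 0.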
4914   for (auto &Induction : Legal->getInductionVars()) {
4915     auto *Ind = Induction.first;
4916     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4917 
4918     // Determine if all users of the induction variable are uniform after
4919     // vectorization.
4920     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4921       auto *I = cast<Instruction>(U);
4922       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4923              isVectorizedMemAccessUse(I, Ind);
4924     });
4925     if (!UniformInd)
4926       continue;
4927 
4928     // Determine if all users of the induction variable update instruction are
4929     // uniform after vectorization.
4930     auto UniformIndUpdate =
4931         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4932           auto *I = cast<Instruction>(U);
4933           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4934                  isVectorizedMemAccessUse(I, IndUpdate);
4935         });
4936     if (!UniformIndUpdate)
4937       continue;
4938 
4939     // The induction variable and its update instruction will remain uniform.
4940     addToWorklistIfAllowed(Ind);
4941     addToWorklistIfAllowed(IndUpdate);
4942   }
4943 
4944   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4945 }
4946 
4947 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4948   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4949 
4950   if (Legal->getRuntimePointerChecking()->Need) {
4951     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4952         "runtime pointer checks needed. Enable vectorization of this "
4953         "loop with '#pragma clang loop vectorize(enable)' when "
4954         "compiling with -Os/-Oz",
4955         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4956     return true;
4957   }
4958 
4959   if (!PSE.getPredicate().isAlwaysTrue()) {
4960     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4961         "runtime SCEV checks needed. Enable vectorization of this "
4962         "loop with '#pragma clang loop vectorize(enable)' when "
4963         "compiling with -Os/-Oz",
4964         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4965     return true;
4966   }
4967 
4968   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4969   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4970     reportVectorizationFailure("Runtime stride check for small trip count",
4971         "runtime stride == 1 checks needed. Enable vectorization of "
4972         "this loop without such check by compiling with -Os/-Oz",
4973         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4974     return true;
4975   }
4976 
4977   return false;
4978 }
4979 
4980 ElementCount
4981 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4982   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4983     return ElementCount::getScalable(0);
4984 
4985   if (Hints->isScalableVectorizationDisabled()) {
4986     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4987                             "ScalableVectorizationDisabled", ORE, TheLoop);
4988     return ElementCount::getScalable(0);
4989   }
4990 
4991   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4992 
4993   auto MaxScalableVF = ElementCount::getScalable(
4994       std::numeric_limits<ElementCount::ScalarTy>::max());
4995 
4996   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4997   // FIXME: While for scalable vectors this is currently sufficient, this should
4998   // be replaced by a more detailed mechanism that filters out specific VFs,
4999   // instead of invalidating vectorization for a whole set of VFs based on the
5000   // MaxVF.
5001 
5002   // Disable scalable vectorization if the loop contains unsupported reductions.
5003   if (!canVectorizeReductions(MaxScalableVF)) {
5004     reportVectorizationInfo(
5005         "Scalable vectorization not supported for the reduction "
5006         "operations found in this loop.",
5007         "ScalableVFUnfeasible", ORE, TheLoop);
5008     return ElementCount::getScalable(0);
5009   }
5010 
5011   // Disable scalable vectorization if the loop contains any instructions
5012   // with element types not supported for scalable vectors.
5013   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
5014         return !Ty->isVoidTy() &&
5015                !this->TTI.isElementTypeLegalForScalableVector(Ty);
5016       })) {
5017     reportVectorizationInfo("Scalable vectorization is not supported "
5018                             "for all element types found in this loop.",
5019                             "ScalableVFUnfeasible", ORE, TheLoop);
5020     return ElementCount::getScalable(0);
5021   }
5022 
5023   if (Legal->isSafeForAnyVectorWidth())
5024     return MaxScalableVF;
5025 
5026   // Limit MaxScalableVF by the maximum safe dependence distance.
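  // For example (hypothetical numbers): with MaxSafeElements = 32 and a
  // target whose maximum vscale is 16, the largest safe scalable VF is
  // 32 / 16 = 2 elements per vscale unit, i.e. <vscale x 2>.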
5027   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5028   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
5029     MaxVScale =
5030         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
5031   MaxScalableVF = ElementCount::getScalable(
5032       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5033   if (!MaxScalableVF)
5034     reportVectorizationInfo(
5035         "Max legal vector width too small, scalable vectorization "
5036         "unfeasible.",
5037         "ScalableVFUnfeasible", ORE, TheLoop);
5038 
5039   return MaxScalableVF;
5040 }
5041 
5042 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
5043     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
5044   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5045   unsigned SmallestType, WidestType;
5046   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5047 
5048   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where the type is taken from
  // the memory access that is most restrictive (i.e. involved in the smallest
  // dependence distance).
5052   unsigned MaxSafeElements =
5053       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
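  // For example (hypothetical numbers): a maximum safe vector width of
  // 384 bits and a widest type of 32 bits give 384 / 32 = 12, rounded down
  // to the power of two 8, so MaxSafeElements = 8.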
5054 
5055   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
5056   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
5057 
5058   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5059                     << ".\n");
5060   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5061                     << ".\n");
5062 
5063   // First analyze the UserVF, fall back if the UserVF should be ignored.
5064   if (UserVF) {
5065     auto MaxSafeUserVF =
5066         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
5067 
5068     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
5069       // If `VF=vscale x N` is safe, then so is `VF=N`
5070       if (UserVF.isScalable())
5071         return FixedScalableVFPair(
5072             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
5073       else
5074         return UserVF;
5075     }
5076 
5077     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5078 
5079     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
5080     // is better to ignore the hint and let the compiler choose a suitable VF.
5081     if (!UserVF.isScalable()) {
5082       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5083                         << " is unsafe, clamping to max safe VF="
5084                         << MaxSafeFixedVF << ".\n");
5085       ORE->emit([&]() {
5086         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5087                                           TheLoop->getStartLoc(),
5088                                           TheLoop->getHeader())
5089                << "User-specified vectorization factor "
5090                << ore::NV("UserVectorizationFactor", UserVF)
5091                << " is unsafe, clamping to maximum safe vectorization factor "
5092                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
5093       });
5094       return MaxSafeFixedVF;
5095     }
5096 
5097     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
5098       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5099                         << " is ignored because scalable vectors are not "
5100                            "available.\n");
5101       ORE->emit([&]() {
5102         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5103                                           TheLoop->getStartLoc(),
5104                                           TheLoop->getHeader())
5105                << "User-specified vectorization factor "
5106                << ore::NV("UserVectorizationFactor", UserVF)
5107                << " is ignored because the target does not support scalable "
5108                   "vectors. The compiler will pick a more suitable value.";
5109       });
5110     } else {
5111       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5112                         << " is unsafe. Ignoring scalable UserVF.\n");
5113       ORE->emit([&]() {
5114         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5115                                           TheLoop->getStartLoc(),
5116                                           TheLoop->getHeader())
5117                << "User-specified vectorization factor "
5118                << ore::NV("UserVectorizationFactor", UserVF)
5119                << " is unsafe. Ignoring the hint to let the compiler pick a "
5120                   "more suitable value.";
5121       });
5122     }
5123   }
5124 
5125   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5126                     << " / " << WidestType << " bits.\n");
5127 
5128   FixedScalableVFPair Result(ElementCount::getFixed(1),
5129                              ElementCount::getScalable(0));
5130   if (auto MaxVF =
5131           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5132                                   MaxSafeFixedVF, FoldTailByMasking))
5133     Result.FixedVF = MaxVF;
5134 
5135   if (auto MaxVF =
5136           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5137                                   MaxSafeScalableVF, FoldTailByMasking))
5138     if (MaxVF.isScalable()) {
5139       Result.ScalableVF = MaxVF;
5140       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5141                         << "\n");
5142     }
5143 
5144   return Result;
5145 }
5146 
5147 FixedScalableVFPair
5148 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5149   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to
    // be dynamically uniform if the target can skip it.
5152     reportVectorizationFailure(
5153         "Not inserting runtime ptr check for divergent target",
5154         "runtime pointer checks needed. Not enabled for divergent target",
5155         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5156     return FixedScalableVFPair::getNone();
5157   }
5158 
5159   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5160   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5161   if (TC == 1) {
5162     reportVectorizationFailure("Single iteration (non) loop",
5163         "loop trip count is one, irrelevant for vectorization",
5164         "SingleIterationLoop", ORE, TheLoop);
5165     return FixedScalableVFPair::getNone();
5166   }
5167 
5168   switch (ScalarEpilogueStatus) {
5169   case CM_ScalarEpilogueAllowed:
5170     return computeFeasibleMaxVF(TC, UserVF, false);
5171   case CM_ScalarEpilogueNotAllowedUsePredicate:
5172     LLVM_FALLTHROUGH;
5173   case CM_ScalarEpilogueNotNeededUsePredicate:
5174     LLVM_DEBUG(
5175         dbgs() << "LV: vector predicate hint/switch found.\n"
5176                << "LV: Not allowing scalar epilogue, creating predicated "
5177                << "vector loop.\n");
5178     break;
5179   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5180     // fallthrough as a special case of OptForSize
5181   case CM_ScalarEpilogueNotAllowedOptSize:
5182     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5183       LLVM_DEBUG(
5184           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5185     else
5186       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5187                         << "count.\n");
5188 
5189     // Bail if runtime checks are required, which are not good when optimising
5190     // for size.
5191     if (runtimeChecksRequired())
5192       return FixedScalableVFPair::getNone();
5193 
5194     break;
5195   }
5196 
  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
5199   // that not every instruction executes on the last iteration.  This will
5200   // require a lane mask which varies through the vector loop body.  (TODO)
5201   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5202     // If there was a tail-folding hint/switch, but we can't fold the tail by
5203     // masking, fallback to a vectorization with a scalar epilogue.
5204     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5205       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5206                            "scalar epilogue instead.\n");
5207       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5208       return computeFeasibleMaxVF(TC, UserVF, false);
5209     }
5210     return FixedScalableVFPair::getNone();
5211   }
5212 
  // Now try folding the tail by masking.
5214 
5215   // Invalidate interleave groups that require an epilogue if we can't mask
5216   // the interleave-group.
5217   if (!useMaskedInterleavedAccesses(TTI)) {
5218     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5219            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5222     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5223   }
5224 
5225   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5226   // Avoid tail folding if the trip count is known to be a multiple of any VF
5227   // we chose.
5228   // FIXME: The condition below pessimises the case for fixed-width vectors,
5229   // when scalable VFs are also candidates for vectorization.
5230   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5231     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5232     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5233            "MaxFixedVF must be a power of 2");
5234     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5235                                    : MaxFixedVF.getFixedValue();
5236     ScalarEvolution *SE = PSE.getSE();
5237     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5238     const SCEV *ExitCount = SE->getAddExpr(
5239         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5240     const SCEV *Rem = SE->getURemExpr(
5241         SE->applyLoopGuards(ExitCount, TheLoop),
5242         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5243     if (Rem->isZero()) {
5244       // Accept MaxFixedVF if we do not have a tail.
5245       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5246       return MaxFactors;
5247     }
5248   }
5249 
  // For scalable vectors, don't use tail folding for low trip counts or when
  // optimizing for code size. We only permit this if the user has explicitly
5252   // requested it.
5253   if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
5254       ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
5255       MaxFactors.ScalableVF.isVector())
5256     MaxFactors.ScalableVF = ElementCount::getScalable(0);
5257 
5258   // If we don't know the precise trip count, or if the trip count that we
5259   // found modulo the vectorization factor is not zero, try to fold the tail
5260   // by masking.
5261   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5262   if (Legal->prepareToFoldTailByMasking()) {
5263     FoldTailByMasking = true;
5264     return MaxFactors;
5265   }
5266 
5267   // If there was a tail-folding hint/switch, but we can't fold the tail by
5268   // masking, fallback to a vectorization with a scalar epilogue.
5269   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5270     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5271                          "scalar epilogue instead.\n");
5272     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5273     return MaxFactors;
5274   }
5275 
5276   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5277     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5278     return FixedScalableVFPair::getNone();
5279   }
5280 
5281   if (TC == 0) {
5282     reportVectorizationFailure(
5283         "Unable to calculate the loop count due to complex control flow",
5284         "unable to calculate the loop count due to complex control flow",
5285         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5286     return FixedScalableVFPair::getNone();
5287   }
5288 
5289   reportVectorizationFailure(
5290       "Cannot optimize for size and vectorize at the same time.",
5291       "cannot optimize for size and vectorize at the same time. "
5292       "Enable vectorization of this loop with '#pragma clang loop "
5293       "vectorize(enable)' when compiling with -Os/-Oz",
5294       "NoTailLoopWithOptForSize", ORE, TheLoop);
5295   return FixedScalableVFPair::getNone();
5296 }
5297 
5298 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5299     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5300     const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
5301   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5302   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5303       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5304                            : TargetTransformInfo::RGK_FixedWidthVector);
5305 
5306   // Convenience function to return the minimum of two ElementCounts.
5307   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5308     assert((LHS.isScalable() == RHS.isScalable()) &&
5309            "Scalable flags must match");
5310     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5311   };
5312 
5313   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
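  // For example (hypothetical target): a 128-bit vector register and a
  // widest type of 32 bits give PowerOf2Floor(128 / 32) = 4 lanes, i.e. a
  // fixed VF of 4 or a scalable VF of <vscale x 4>.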
5315   auto MaxVectorElementCount = ElementCount::get(
5316       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5317       ComputeScalableMaxVF);
5318   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5319   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5320                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5321 
5322   if (!MaxVectorElementCount) {
5323     LLVM_DEBUG(dbgs() << "LV: The target has no "
5324                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5325                       << " vector registers.\n");
5326     return ElementCount::getFixed(1);
5327   }
5328 
5329   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5330   if (ConstTripCount &&
5331       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5332       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5333     // If loop trip count (TC) is known at compile time there is no point in
5334     // choosing VF greater than TC (as done in the loop below). Select maximum
5335     // power of two which doesn't exceed TC.
5336     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5337     // when the TC is less than or equal to the known number of lanes.
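    // For example (hypothetical values): with ConstTripCount = 10, a
    // MaxVectorElementCount of 16 and no tail folding, the VF is clamped to
    // PowerOf2Floor(10) = 8.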
5338     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5339     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5340                          "exceeding the constant trip count: "
5341                       << ClampedConstTripCount << "\n");
5342     return ElementCount::getFixed(ClampedConstTripCount);
5343   }
5344 
5345   ElementCount MaxVF = MaxVectorElementCount;
5346   if (TTI.shouldMaximizeVectorBandwidth() ||
5347       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5348     auto MaxVectorElementCountMaxBW = ElementCount::get(
5349         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5350         ComputeScalableMaxVF);
5351     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5352 
5353     // Collect all viable vectorization factors larger than the default MaxVF
5354     // (i.e. MaxVectorElementCount).
5355     SmallVector<ElementCount, 8> VFs;
5356     for (ElementCount VS = MaxVectorElementCount * 2;
5357          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5358       VFs.push_back(VS);
5359 
5360     // For each VF calculate its register usage.
5361     auto RUs = calculateRegisterUsage(VFs);
5362 
5363     // Select the largest VF which doesn't require more registers than existing
5364     // ones.
5365     for (int i = RUs.size() - 1; i >= 0; --i) {
5366       bool Selected = true;
5367       for (auto &pair : RUs[i].MaxLocalUsers) {
5368         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5369         if (pair.second > TargetNumRegisters)
5370           Selected = false;
5371       }
5372       if (Selected) {
5373         MaxVF = VFs[i];
5374         break;
5375       }
5376     }
5377     if (ElementCount MinVF =
5378             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5379       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5380         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5381                           << ") with target's minimum: " << MinVF << '\n');
5382         MaxVF = MinVF;
5383       }
5384     }
5385   }
5386   return MaxVF;
5387 }
5388 
5389 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5390   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5391     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5392     auto Min = Attr.getVScaleRangeMin();
5393     auto Max = Attr.getVScaleRangeMax();
5394     if (Max && Min == Max)
5395       return Max;
5396   }
5397 
5398   return TTI.getVScaleForTuning();
5399 }
5400 
5401 bool LoopVectorizationCostModel::isMoreProfitable(
5402     const VectorizationFactor &A, const VectorizationFactor &B) const {
5403   InstructionCost CostA = A.Cost;
5404   InstructionCost CostB = B.Cost;
5405 
5406   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5407 
5408   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5409       MaxTripCount) {
5410     // If we are folding the tail and the trip count is a known (possibly small)
5411     // constant, the trip count will be rounded up to an integer number of
5412     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5413     // which we compare directly. When not folding the tail, the total cost will
5414     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5415     // approximated with the per-lane cost below instead of using the tripcount
5416     // as here.
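    // For example (hypothetical costs): with MaxTripCount = 10, a VF-4
    // factor of cost 8 gives 8 * ceil(10 / 4) = 24, while a VF-8 factor of
    // cost 14 gives 14 * ceil(10 / 8) = 28, so the VF-4 factor wins here.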
5417     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5418     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5419     return RTCostA < RTCostB;
5420   }
5421 
5422   // Improve estimate for the vector width if it is scalable.
5423   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5424   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5425   if (Optional<unsigned> VScale = getVScaleForTuning()) {
5426     if (A.Width.isScalable())
5427       EstimatedWidthA *= VScale.getValue();
5428     if (B.Width.isScalable())
5429       EstimatedWidthB *= VScale.getValue();
5430   }
5431 
5432   // Assume vscale may be larger than 1 (or the value being tuned for),
5433   // so that scalable vectorization is slightly favorable over fixed-width
5434   // vectorization.
5435   if (A.Width.isScalable() && !B.Width.isScalable())
5436     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5437 
5438   // To avoid the need for FP division:
5439   //      (CostA / A.Width) < (CostB / B.Width)
5440   // <=>  (CostA * B.Width) < (CostB * A.Width)
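  // For example (hypothetical costs): A with Width 4 and CostA 8 beats B
  // with Width 8 and CostB 20, since 8 * 8 = 64 < 20 * 4 = 80, i.e. a
  // per-lane cost of 2 vs 2.5.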
5441   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5442 }
5443 
5444 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5445     const ElementCountSet &VFCandidates) {
5446   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5447   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5448   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5449   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5450          "Expected Scalar VF to be a candidate");
5451 
5452   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
5453   VectorizationFactor ChosenFactor = ScalarCost;
5454 
5455   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5456   if (ForceVectorization && VFCandidates.size() > 1) {
5457     // Ignore scalar width, because the user explicitly wants vectorization.
5458     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5459     // evaluation.
5460     ChosenFactor.Cost = InstructionCost::getMax();
5461   }
5462 
5463   SmallVector<InstructionVFPair> InvalidCosts;
5464   for (const auto &i : VFCandidates) {
5465     // The cost for scalar VF=1 is already calculated, so ignore it.
5466     if (i.isScalar())
5467       continue;
5468 
5469     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5470     VectorizationFactor Candidate(i, C.first);
5471 
5472 #ifndef NDEBUG
5473     unsigned AssumedMinimumVscale = 1;
5474     if (Optional<unsigned> VScale = getVScaleForTuning())
5475       AssumedMinimumVscale = VScale.getValue();
5476     unsigned Width =
5477         Candidate.Width.isScalable()
5478             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5479             : Candidate.Width.getFixedValue();
5480     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5481                       << " costs: " << (Candidate.Cost / Width));
5482     if (i.isScalable())
5483       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5484                         << AssumedMinimumVscale << ")");
5485     LLVM_DEBUG(dbgs() << ".\n");
5486 #endif
5487 
5488     if (!C.second && !ForceVectorization) {
5489       LLVM_DEBUG(
5490           dbgs() << "LV: Not considering vector loop of width " << i
5491                  << " because it will not generate any vector instructions.\n");
5492       continue;
5493     }
5494 
5495     // If profitable add it to ProfitableVF list.
5496     if (isMoreProfitable(Candidate, ScalarCost))
5497       ProfitableVFs.push_back(Candidate);
5498 
5499     if (isMoreProfitable(Candidate, ChosenFactor))
5500       ChosenFactor = Candidate;
5501   }
5502 
5503   // Emit a report of VFs with invalid costs in the loop.
5504   if (!InvalidCosts.empty()) {
5505     // Group the remarks per instruction, keeping the instruction order from
5506     // InvalidCosts.
5507     std::map<Instruction *, unsigned> Numbering;
5508     unsigned I = 0;
5509     for (auto &Pair : InvalidCosts)
5510       if (!Numbering.count(Pair.first))
5511         Numbering[Pair.first] = I++;
5512 
5513     // Sort the list, first on instruction(number) then on VF.
5514     llvm::sort(InvalidCosts,
5515                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5516                  if (Numbering[A.first] != Numbering[B.first])
5517                    return Numbering[A.first] < Numbering[B.first];
5518                  ElementCountComparator ECC;
5519                  return ECC(A.second, B.second);
5520                });
5521 
5522     // For a list of ordered instruction-vf pairs:
5523     //   [(load, vf1), (load, vf2), (store, vf1)]
5524     // Group the instructions together to emit separate remarks for:
5525     //   load  (vf1, vf2)
5526     //   store (vf1)
5527     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5528     auto Subset = ArrayRef<InstructionVFPair>();
5529     do {
5530       if (Subset.empty())
5531         Subset = Tail.take_front(1);
5532 
5533       Instruction *I = Subset.front().first;
5534 
5535       // If the next instruction is different, or if there are no other pairs,
5536       // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2)]
      // to emit:
      //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5540       if (Subset == Tail || Tail[Subset.size()].first != I) {
5541         std::string OutString;
5542         raw_string_ostream OS(OutString);
5543         assert(!Subset.empty() && "Unexpected empty range");
5544         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5545         for (auto &Pair : Subset)
5546           OS << (Pair.second == Subset.front().second ? "" : ", ")
5547              << Pair.second;
5548         OS << "):";
5549         if (auto *CI = dyn_cast<CallInst>(I))
5550           OS << " call to " << CI->getCalledFunction()->getName();
5551         else
5552           OS << " " << I->getOpcodeName();
5553         OS.flush();
5554         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5555         Tail = Tail.drop_front(Subset.size());
5556         Subset = {};
5557       } else
5558         // Grow the subset by one element
5559         Subset = Tail.take_front(Subset.size() + 1);
5560     } while (!Tail.empty());
5561   }
5562 
5563   if (!EnableCondStoresVectorization && NumPredStores) {
5564     reportVectorizationFailure("There are conditional stores.",
5565         "store that is conditionally executed prevents vectorization",
5566         "ConditionalStore", ORE, TheLoop);
5567     ChosenFactor = ScalarCost;
5568   }
5569 
5570   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5571                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5572              << "LV: Vectorization seems to be not beneficial, "
5573              << "but was forced by a user.\n");
5574   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5575   return ChosenFactor;
5576 }
5577 
5578 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5579     const Loop &L, ElementCount VF) const {
5580   // Cross iteration phis such as reductions need special handling and are
5581   // currently unsupported.
5582   if (any_of(L.getHeader()->phis(),
5583              [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
5584     return false;
5585 
5586   // Phis with uses outside of the loop require special handling and are
5587   // currently unsupported.
5588   for (auto &Entry : Legal->getInductionVars()) {
5589     // Look for uses of the value of the induction at the last iteration.
5590     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5591     for (User *U : PostInc->users())
5592       if (!L.contains(cast<Instruction>(U)))
5593         return false;
5594     // Look for uses of penultimate value of the induction.
5595     for (User *U : Entry.first->users())
5596       if (!L.contains(cast<Instruction>(U)))
5597         return false;
5598   }
5599 
5600   // Induction variables that are widened require special handling that is
5601   // currently not supported.
5602   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5603         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5604                  this->isProfitableToScalarize(Entry.first, VF));
5605       }))
5606     return false;
5607 
  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
5611   if (L.getExitingBlock() != L.getLoopLatch())
5612     return false;
5613 
5614   return true;
5615 }
5616 
5617 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5618     const ElementCount VF) const {
5619   // FIXME: We need a much better cost-model to take different parameters such
5620   // as register pressure, code size increase and cost of extra branches into
5621   // account. For now we apply a very crude heuristic and only consider loops
5622   // with vectorization factors larger than a certain value.
5623   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
5625   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5626     return false;
5627   // FIXME: We should consider changing the threshold for scalable
5628   // vectors to take VScaleForTuning into account.
5629   if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5630     return true;
5631   return false;
5632 }
5633 
5634 VectorizationFactor
5635 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5636     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5637   VectorizationFactor Result = VectorizationFactor::Disabled();
5638   if (!EnableEpilogueVectorization) {
5639     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5640     return Result;
5641   }
5642 
5643   if (!isScalarEpilogueAllowed()) {
5644     LLVM_DEBUG(
5645         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5646                   "allowed.\n";);
5647     return Result;
5648   }
5649 
5650   // Not really a cost consideration, but check for unsupported cases here to
5651   // simplify the logic.
5652   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5653     LLVM_DEBUG(
5654         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5655                   "not a supported candidate.\n";);
5656     return Result;
5657   }
5658 
5659   if (EpilogueVectorizationForceVF > 1) {
5660     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5661     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5662     if (LVP.hasPlanWithVF(ForcedEC))
5663       return {ForcedEC, 0};
5664     else {
5665       LLVM_DEBUG(
5666           dbgs()
5667               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5668       return Result;
5669     }
5670   }
5671 
5672   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5673       TheLoop->getHeader()->getParent()->hasMinSize()) {
5674     LLVM_DEBUG(
5675         dbgs()
5676             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5677     return Result;
5678   }
5679 
5680   if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5681     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5682                          "this loop\n");
5683     return Result;
5684   }
5685 
5686   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5687   // the main loop handles 8 lanes per iteration. We could still benefit from
5688   // vectorizing the epilogue loop with VF=4.
5689   ElementCount EstimatedRuntimeVF = MainLoopVF;
5690   if (MainLoopVF.isScalable()) {
5691     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5692     if (Optional<unsigned> VScale = getVScaleForTuning())
5693       EstimatedRuntimeVF *= VScale.getValue();
5694   }
5695 
5696   for (auto &NextVF : ProfitableVFs)
5697     if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5698           ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5699          ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5700         (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5701         LVP.hasPlanWithVF(NextVF.Width))
5702       Result = NextVF;
5703 
5704   if (Result != VectorizationFactor::Disabled())
5705     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5706                       << Result.Width << "\n";);
5707   return Result;
5708 }
5709 
5710 std::pair<unsigned, unsigned>
5711 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5712   unsigned MinWidth = -1U;
5713   unsigned MaxWidth = 8;
5714   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5715   // For in-loop reductions, no element types are added to ElementTypesInLoop
5716   // if there are no loads/stores in the loop. In this case, check through the
5717   // reduction variables to determine the maximum width.
5718   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5719     // Reset MaxWidth so that we can find the smallest type used by recurrences
5720     // in the loop.
5721     MaxWidth = -1U;
5722     for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
5723       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5724       // When finding the min width used by the recurrence we need to account
5725       // for casts on the input operands of the recurrence.
5726       MaxWidth = std::min<unsigned>(
5727           MaxWidth, std::min<unsigned>(
5728                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5729                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5730     }
5731   } else {
5732     for (Type *T : ElementTypesInLoop) {
5733       MinWidth = std::min<unsigned>(
5734           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5735       MaxWidth = std::max<unsigned>(
5736           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5737     }
5738   }
5739   return {MinWidth, MaxWidth};
5740 }
5741 
5742 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5743   ElementTypesInLoop.clear();
5744   // For each block.
5745   for (BasicBlock *BB : TheLoop->blocks()) {
5746     // For each instruction in the loop.
5747     for (Instruction &I : BB->instructionsWithoutDebug()) {
5748       Type *T = I.getType();
5749 
5750       // Skip ignored values.
5751       if (ValuesToIgnore.count(&I))
5752         continue;
5753 
5754       // Only examine Loads, Stores and PHINodes.
5755       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5756         continue;
5757 
5758       // Examine PHI nodes that are reduction variables. Update the type to
5759       // account for the recurrence type.
5760       if (auto *PN = dyn_cast<PHINode>(&I)) {
5761         if (!Legal->isReductionVariable(PN))
5762           continue;
5763         const RecurrenceDescriptor &RdxDesc =
5764             Legal->getReductionVars().find(PN)->second;
5765         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5766             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5767                                       RdxDesc.getRecurrenceType(),
5768                                       TargetTransformInfo::ReductionFlags()))
5769           continue;
5770         T = RdxDesc.getRecurrenceType();
5771       }
5772 
5773       // Examine the stored values.
5774       if (auto *ST = dyn_cast<StoreInst>(&I))
5775         T = ST->getValueOperand()->getType();
5776 
5777       assert(T->isSized() &&
5778              "Expected the load/store/recurrence type to be sized");
5779 
5780       ElementTypesInLoop.insert(T);
5781     }
5782   }
5783 }
5784 
5785 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5786                                                            unsigned LoopCost) {
5787   // -- The interleave heuristics --
5788   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5789   // There are many micro-architectural considerations that we can't predict
5790   // at this level. For example, frontend pressure (on decode or fetch) due to
5791   // code size, or the number and capabilities of the execution ports.
5792   //
5793   // We use the following heuristics to select the interleave count:
5794   // 1. If the code has reductions, then we interleave to break the cross
5795   // iteration dependency.
5796   // 2. If the loop is really small, then we interleave to reduce the loop
5797   // overhead.
5798   // 3. We don't interleave if we think that we will spill registers to memory
5799   // due to the increased register pressure.
5800 
5801   if (!isScalarEpilogueAllowed())
5802     return 1;
5803 
  // Do not interleave if there is a finite maximum safe dependence distance;
  // the distance already limited the VF, and interleaving would widen the
  // effective access span further.
5805   if (Legal->getMaxSafeDepDistBytes() != -1U)
5806     return 1;
5807 
5808   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5809   const bool HasReductions = !Legal->getReductionVars().empty();
5810   // Do not interleave loops with a relatively small known or estimated trip
5811   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled and the code has scalar reductions (HasReductions && VF == 1),
5813   // because with the above conditions interleaving can expose ILP and break
5814   // cross iteration dependences for reductions.
5815   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5816       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5817     return 1;
5818 
5819   RegisterUsage R = calculateRegisterUsage({VF})[0];
5820   // We divide by these constants so assume that we have at least one
5821   // instruction that uses at least one register.
5822   for (auto& pair : R.MaxLocalUsers) {
5823     pair.second = std::max(pair.second, 1U);
5824   }
5825 
5826   // We calculate the interleave count using the following formula.
5827   // Subtract the number of loop invariants from the number of available
5828   // registers. These registers are used by all of the interleaved instances.
5829   // Next, divide the remaining registers by the number of registers that is
5830   // required by the loop, in order to estimate how many parallel instances
5831   // fit without causing spills. All of this is rounded down if necessary to be
5832   // a power of two. We want power of two interleave count to simplify any
5833   // addressing operations or alignment considerations.
5834   // We also want power of two interleave counts to ensure that the induction
5835   // variable of the vector loop wraps to zero, when tail is folded by masking;
5836   // this currently happens when OptForSize, in which case IC is set to 1 above.
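  // For example (hypothetical numbers): with 32 registers in a class, 2 of
  // them used by loop invariants and 5 live values per interleaved instance,
  // (32 - 2) / 5 = 6, rounded down to a power of two, gives an IC of 4.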
5837   unsigned IC = UINT_MAX;
5838 
5839   for (auto& pair : R.MaxLocalUsers) {
5840     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5841     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5842                       << " registers of "
5843                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5844     if (VF.isScalar()) {
5845       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5846         TargetNumRegisters = ForceTargetNumScalarRegs;
5847     } else {
5848       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5849         TargetNumRegisters = ForceTargetNumVectorRegs;
5850     }
5851     unsigned MaxLocalUsers = pair.second;
5852     unsigned LoopInvariantRegs = 0;
5853     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5854       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5855 
5856     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5857     // Don't count the induction variable as interleaved.
5858     if (EnableIndVarRegisterHeur) {
5859       TmpIC =
5860           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5861                         std::max(1U, (MaxLocalUsers - 1)));
5862     }
5863 
5864     IC = std::min(IC, TmpIC);
5865   }
5866 
5867   // Clamp the interleave ranges to reasonable counts.
5868   unsigned MaxInterleaveCount =
5869       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5870 
5871   // Check if the user has overridden the max.
5872   if (VF.isScalar()) {
5873     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5874       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5875   } else {
5876     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5877       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5878   }
5879 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF, provided it
  // is at least 1.
5883   //
5884   // For scalable vectors we can't know if interleaving is beneficial. It may
5885   // not be beneficial for small loops if none of the lanes in the second vector
  // iteration is enabled. However, for larger loops, there is likely to be a
5887   // similar benefit as for fixed-width vectors. For now, we choose to leave
5888   // the InterleaveCount as if vscale is '1', although if some information about
5889   // the vector is known (e.g. min vector size), we can make a better decision.
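  // For example (hypothetical values): with a known trip count of 48 and
  // VF = 8, the interleave count is capped at 48 / 8 = 6.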
5890   if (BestKnownTC) {
5891     MaxInterleaveCount =
5892         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5893     // Make sure MaxInterleaveCount is greater than 0.
5894     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5895   }
5896 
5897   assert(MaxInterleaveCount > 0 &&
5898          "Maximum interleave count must be greater than 0");
5899 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
5902   if (IC > MaxInterleaveCount)
5903     IC = MaxInterleaveCount;
5904   else
5905     // Make sure IC is greater than 0.
5906     IC = std::max(1u, IC);
5907 
5908   assert(IC > 0 && "Interleave count must be greater than 0.");
5909 
5910   // If we did not calculate the cost for VF (because the user selected the VF)
5911   // then we calculate the cost of VF here.
5912   if (LoopCost == 0) {
5913     InstructionCost C = expectedCost(VF).first;
5914     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
5915     LoopCost = *C.getValue();
5916   }
5917 
5918   assert(LoopCost && "Non-zero loop cost expected");
5919 
5920   // Interleave if we vectorized this loop and there is a reduction that could
5921   // benefit from interleaving.
5922   if (VF.isVector() && HasReductions) {
5923     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5924     return IC;
5925   }
5926 
5927   // For any scalar loop that either requires runtime checks or predication we
5928   // are better off leaving this to the unroller. Note that if we've already
5929   // vectorized the loop we will have done the runtime check and so interleaving
5930   // won't require further checks.
5931   bool ScalarInterleavingRequiresPredication =
5932       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5933          return Legal->blockNeedsPredication(BB);
5934        }));
5935   bool ScalarInterleavingRequiresRuntimePointerCheck =
5936       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5937 
5938   // We want to interleave small loops in order to reduce the loop overhead and
5939   // potentially expose ILP opportunities.
5940   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5941                     << "LV: IC is " << IC << '\n'
5942                     << "LV: VF is " << VF << '\n');
5943   const bool AggressivelyInterleaveReductions =
5944       TTI.enableAggressiveInterleaving(HasReductions);
5945   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5946       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
    // We assume that the loop overhead cost is 1. Using the cost model's
    // estimate of the loop body cost, we interleave until the overhead is
    // about 5% of the total cost of the interleaved loop.
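    // For example (hypothetical, assuming the default small-loop-cost
    // threshold of 20): a loop body cost of 5 gives SmallIC = min(IC, 4),
    // so the per-iteration overhead of 1 is about 1 / 20 = 5% of the total.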
5950     unsigned SmallIC =
5951         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5952 
5953     // Interleave until store/load ports (estimated by max interleave count) are
5954     // saturated.
5955     unsigned NumStores = Legal->getNumStores();
5956     unsigned NumLoads = Legal->getNumLoads();
5957     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5958     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5959 
5960     // There is little point in interleaving for reductions containing selects
5961     // and compares when VF=1 since it may just create more overhead than it's
5962     // worth for loops with small trip counts. This is because we still have to
5963     // do the final reduction after the loop.
5964     bool HasSelectCmpReductions =
5965         HasReductions &&
5966         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5967           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5968           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
5969               RdxDesc.getRecurrenceKind());
5970         });
5971     if (HasSelectCmpReductions) {
5972       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5973       return 1;
5974     }
5975 
5976     // If we have a scalar reduction (vector reductions are already dealt with
5977     // by this point), we can increase the critical path length if the loop
5978     // we're interleaving is inside another loop. For tree-wise reductions
5979     // set the limit to 2, and for ordered reductions it's best to disable
5980     // interleaving entirely.
5981     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5982       bool HasOrderedReductions =
5983           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5984             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5985             return RdxDesc.isOrdered();
5986           });
5987       if (HasOrderedReductions) {
5988         LLVM_DEBUG(
5989             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5990         return 1;
5991       }
5992 
5993       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5994       SmallIC = std::min(SmallIC, F);
5995       StoresIC = std::min(StoresIC, F);
5996       LoadsIC = std::min(LoadsIC, F);
5997     }
5998 
5999     if (EnableLoadStoreRuntimeInterleave &&
6000         std::max(StoresIC, LoadsIC) > SmallIC) {
6001       LLVM_DEBUG(
6002           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6003       return std::max(StoresIC, LoadsIC);
6004     }
6005 
6006     // If there are scalar reductions and TTI has enabled aggressive
6007     // interleaving for reductions, we will interleave to expose ILP.
6008     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6009         AggressivelyInterleaveReductions) {
6010       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6011       // Interleave no less than SmallIC but not as aggressive as the normal IC
6012       // to satisfy the rare situation when resources are too limited.
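      // For example (illustrative numbers only): with IC = 8 and SmallIC = 2,
      // this returns max(8 / 2, 2) = 4.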
6013       return std::max(IC / 2, SmallIC);
6014     } else {
6015       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6016       return SmallIC;
6017     }
6018   }
6019 
6020   // Interleave if this is a large loop (small loops are already dealt with by
6021   // this point) that could benefit from interleaving.
6022   if (AggressivelyInterleaveReductions) {
6023     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6024     return IC;
6025   }
6026 
6027   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6028   return 1;
6029 }
6030 
6031 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6032 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6033   // This function calculates the register usage by measuring the highest number
6034   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order in order to
6036   // assign a number to each instruction. We use RPO to ensure that defs are
6037   // met before their users. We assume that each instruction that has in-loop
6038   // users starts an interval. We record every time that an in-loop value is
6039   // used, so we have a list of the first and last occurrences of each
6040   // instruction. Next, we transpose this data structure into a multi map that
6041   // holds the list of intervals that *end* at a specific location. This multi
6042   // map allows us to perform a linear search. We scan the instructions linearly
6043   // and record each time that a new interval starts, by placing it in a set.
6044   // If we find this value in the multi-map then we remove it from the set.
6045   // The max register usage is the maximum size of the set.
6046   // We also search for instructions that are defined outside the loop, but are
6047   // used inside the loop. We need this number separately from the max-interval
6048   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
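  // As a small illustrative example (not tied to any particular input), for
  // the straight-line sequence
  //   %a = ...
  //   %b = ...
  //   %c = add %a, %b
  //   %d = mul %c, %c
  // the intervals of %a and %b both end at %c, so at most two values are live
  // at any point and the estimated maximum register usage is 2.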
6050   LoopBlocksDFS DFS(TheLoop);
6051   DFS.perform(LI);
6052 
6053   RegisterUsage RU;
6054 
6055   // Each 'key' in the map opens a new interval. The values
6056   // of the map are the index of the 'last seen' usage of the
6057   // instruction that is the key.
6058   using IntervalMap = DenseMap<Instruction *, unsigned>;
6059 
6060   // Maps instruction to its index.
6061   SmallVector<Instruction *, 64> IdxToInstr;
6062   // Marks the end of each interval.
6063   IntervalMap EndPoint;
6064   // Saves the list of instruction indices that are used in the loop.
6065   SmallPtrSet<Instruction *, 8> Ends;
6066   // Saves the list of values that are used in the loop but are
6067   // defined outside the loop, such as arguments and constants.
6068   SmallPtrSet<Value *, 8> LoopInvariants;
6069 
6070   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6071     for (Instruction &I : BB->instructionsWithoutDebug()) {
6072       IdxToInstr.push_back(&I);
6073 
6074       // Save the end location of each USE.
6075       for (Value *U : I.operands()) {
6076         auto *Instr = dyn_cast<Instruction>(U);
6077 
6078         // Ignore non-instruction values such as arguments, constants, etc.
6079         if (!Instr)
6080           continue;
6081 
6082         // If this instruction is outside the loop then record it and continue.
6083         if (!TheLoop->contains(Instr)) {
6084           LoopInvariants.insert(Instr);
6085           continue;
6086         }
6087 
6088         // Overwrite previous end points.
6089         EndPoint[Instr] = IdxToInstr.size();
6090         Ends.insert(Instr);
6091       }
6092     }
6093   }
6094 
6095   // Saves the list of intervals that end with the index in 'key'.
6096   using InstrList = SmallVector<Instruction *, 2>;
6097   DenseMap<unsigned, InstrList> TransposeEnds;
6098 
6099   // Transpose the EndPoints to a list of values that end at each index.
6100   for (auto &Interval : EndPoint)
6101     TransposeEnds[Interval.second].push_back(Interval.first);
6102 
6103   SmallPtrSet<Instruction *, 8> OpenIntervals;
6104   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6105   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6106 
6107   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6108 
6109   // A lambda that gets the register usage for the given type and VF.
6110   const auto &TTICapture = TTI;
6111   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6112     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6113       return 0;
6114     InstructionCost::CostType RegUsage =
6115         *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6116     assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
6117            "Nonsensical values for register usage.");
6118     return RegUsage;
6119   };
6120 
6121   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6122     Instruction *I = IdxToInstr[i];
6123 
6124     // Remove all of the instructions that end at this location.
6125     InstrList &List = TransposeEnds[i];
6126     for (Instruction *ToRemove : List)
6127       OpenIntervals.erase(ToRemove);
6128 
6129     // Ignore instructions that are never used within the loop.
6130     if (!Ends.count(I))
6131       continue;
6132 
6133     // Skip ignored values.
6134     if (ValuesToIgnore.count(I))
6135       continue;
6136 
6137     // For each VF find the maximum usage of registers.
6138     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6139       // Count the number of live intervals.
6140       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6141 
6142       if (VFs[j].isScalar()) {
6143         for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          // Map entries are value-initialized to zero, so a plain increment
          // covers both the first and later uses of this register class.
          RegUsage[ClassID] += 1;
6149         }
6150       } else {
6151         collectUniformsAndScalars(VFs[j]);
6152         for (auto Inst : OpenIntervals) {
6153           // Skip ignored values for VF > 1.
6154           if (VecValuesToIgnore.count(Inst))
6155             continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
6169         }
6170       }
6171 
      for (auto &Pair : RegUsage) {
        // Entries default to zero, so taking the max against the existing
        // value also handles the first occurrence of a register class.
        auto &MaxEntry = MaxUsages[j][Pair.first];
        MaxEntry = std::max(MaxEntry, Pair.second);
      }
6178     }
6179 
6180     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6181                       << OpenIntervals.size() << '\n');
6182 
6183     // Add the current instruction to the list of open intervals.
6184     OpenIntervals.insert(I);
6185   }
6186 
6187   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6188     SmallMapVector<unsigned, unsigned, 4> Invariant;
6189 
6190     for (auto Inst : LoopInvariants) {
6191       unsigned Usage =
6192           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6193       unsigned ClassID =
6194           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      // Entries default to zero, so a plain accumulate is sufficient.
      Invariant[ClassID] += Usage;
6199     }
6200 
6201     LLVM_DEBUG({
6202       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6203       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6204              << " item\n";
6205       for (const auto &pair : MaxUsages[i]) {
6206         dbgs() << "LV(REG): RegisterClass: "
6207                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6208                << " registers\n";
6209       }
6210       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6211              << " item\n";
6212       for (const auto &pair : Invariant) {
6213         dbgs() << "LV(REG): RegisterClass: "
6214                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6215                << " registers\n";
6216       }
6217     });
6218 
6219     RU.LoopInvariantRegs = Invariant;
6220     RU.MaxLocalUsers = MaxUsages[i];
6221     RUs[i] = RU;
6222   }
6223 
6224   return RUs;
6225 }
6226 
6227 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6228                                                            ElementCount VF) {
6229   // TODO: Cost model for emulated masked load/store is completely
6230   // broken. This hack guides the cost model to use an artificially
6231   // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Emulation of masked loads/gathers was previously never allowed, while
  // emulation of a limited number of masked stores/scatters was allowed.
6237   assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
6238   return isa<LoadInst>(I) ||
6239          (isa<StoreInst>(I) &&
6240           NumPredStores > NumberOfStoresToPredicate);
6241 }
6242 
6243 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6244   // If we aren't vectorizing the loop, or if we've already collected the
6245   // instructions to scalarize, there's nothing to do. Collection may already
6246   // have occurred if we have a user-selected VF and are now computing the
6247   // expected cost for interleaving.
6248   if (VF.isScalar() || VF.isZero() ||
6249       InstsToScalarize.find(VF) != InstsToScalarize.end())
6250     return;
6251 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6253   // not profitable to scalarize any instructions, the presence of VF in the
6254   // map will indicate that we've analyzed it already.
6255   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6256 
6257   // Find all the instructions that are scalar with predication in the loop and
6258   // determine if it would be better to not if-convert the blocks they are in.
6259   // If so, we also record the instructions to scalarize.
6260   for (BasicBlock *BB : TheLoop->blocks()) {
6261     if (!blockNeedsPredicationForAnyReason(BB))
6262       continue;
6263     for (Instruction &I : *BB)
6264       if (isScalarWithPredication(&I, VF)) {
6265         ScalarCostsTy ScalarCosts;
6266         // Do not apply discount if scalable, because that would lead to
6267         // invalid scalarization costs.
6268         // Do not apply discount logic if hacked cost is needed
6269         // for emulated masked memrefs.
6270         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6271             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6272           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6273         // Remember that BB will remain after vectorization.
6274         PredicatedBBsAfterVectorization.insert(BB);
6275       }
6276   }
6277 }
6278 
6279 int LoopVectorizationCostModel::computePredInstDiscount(
6280     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6281   assert(!isUniformAfterVectorization(PredInst, VF) &&
6282          "Instruction marked uniform-after-vectorization will be predicated");
6283 
6284   // Initialize the discount to zero, meaning that the scalar version and the
6285   // vector version cost the same.
6286   InstructionCost Discount = 0;
6287 
6288   // Holds instructions to analyze. The instructions we visit are mapped in
6289   // ScalarCosts. Those instructions are the ones that would be scalarized if
6290   // we find that the scalar version costs less.
6291   SmallVector<Instruction *, 8> Worklist;
6292 
6293   // Returns true if the given instruction can be scalarized.
6294   auto canBeScalarized = [&](Instruction *I) -> bool {
6295     // We only attempt to scalarize instructions forming a single-use chain
6296     // from the original predicated block that would otherwise be vectorized.
6297     // Although not strictly necessary, we give up on instructions we know will
6298     // already be scalar to avoid traversing chains that are unlikely to be
6299     // beneficial.
6300     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6301         isScalarAfterVectorization(I, VF))
6302       return false;
6303 
6304     // If the instruction is scalar with predication, it will be analyzed
6305     // separately. We ignore it within the context of PredInst.
6306     if (isScalarWithPredication(I, VF))
6307       return false;
6308 
6309     // If any of the instruction's operands are uniform after vectorization,
6310     // the instruction cannot be scalarized. This prevents, for example, a
6311     // masked load from being scalarized.
6312     //
6313     // We assume we will only emit a value for lane zero of an instruction
6314     // marked uniform after vectorization, rather than VF identical values.
6315     // Thus, if we scalarize an instruction that uses a uniform, we would
6316     // create uses of values corresponding to the lanes we aren't emitting code
6317     // for. This behavior can be changed by allowing getScalarValue to clone
6318     // the lane zero values for uniforms rather than asserting.
6319     for (Use &U : I->operands())
6320       if (auto *J = dyn_cast<Instruction>(U.get()))
6321         if (isUniformAfterVectorization(J, VF))
6322           return false;
6323 
6324     // Otherwise, we can scalarize the instruction.
6325     return true;
6326   };
6327 
6328   // Compute the expected cost discount from scalarizing the entire expression
6329   // feeding the predicated instruction. We currently only consider expressions
6330   // that are single-use instruction chains.
6331   Worklist.push_back(PredInst);
6332   while (!Worklist.empty()) {
6333     Instruction *I = Worklist.pop_back_val();
6334 
6335     // If we've already analyzed the instruction, there's nothing to do.
6336     if (ScalarCosts.find(I) != ScalarCosts.end())
6337       continue;
6338 
6339     // Compute the cost of the vector instruction. Note that this cost already
6340     // includes the scalarization overhead of the predicated instruction.
6341     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6342 
6343     // Compute the cost of the scalarized instruction. This cost is the cost of
6344     // the instruction as if it wasn't if-converted and instead remained in the
6345     // predicated block. We will scale this cost by block probability after
6346     // computing the scalarization overhead.
6347     InstructionCost ScalarCost =
6348         VF.getFixedValue() *
6349         getInstructionCost(I, ElementCount::getFixed(1)).first;
6350 
6351     // Compute the scalarization overhead of needed insertelement instructions
6352     // and phi nodes.
6353     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6354       ScalarCost += TTI.getScalarizationOverhead(
6355           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6356           APInt::getAllOnes(VF.getFixedValue()), true, false);
6357       ScalarCost +=
6358           VF.getFixedValue() *
6359           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6360     }
6361 
6362     // Compute the scalarization overhead of needed extractelement
6363     // instructions. For each of the instruction's operands, if the operand can
6364     // be scalarized, add it to the worklist; otherwise, account for the
6365     // overhead.
6366     for (Use &U : I->operands())
6367       if (auto *J = dyn_cast<Instruction>(U.get())) {
6368         assert(VectorType::isValidElementType(J->getType()) &&
6369                "Instruction has non-scalar type");
6370         if (canBeScalarized(J))
6371           Worklist.push_back(J);
6372         else if (needsExtract(J, VF)) {
6373           ScalarCost += TTI.getScalarizationOverhead(
6374               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6375               APInt::getAllOnes(VF.getFixedValue()), false, true);
6376         }
6377       }
6378 
6379     // Scale the total scalar cost by block probability.
6380     ScalarCost /= getReciprocalPredBlockProb();
6381 
6382     // Compute the discount. A non-negative discount means the vector version
6383     // of the instruction costs more, and scalarizing would be beneficial.
6384     Discount += VectorCost - ScalarCost;
6385     ScalarCosts[I] = ScalarCost;
6386   }
6387 
6388   return *Discount.getValue();
6389 }
6390 
6391 LoopVectorizationCostModel::VectorizationCostTy
6392 LoopVectorizationCostModel::expectedCost(
6393     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6394   VectorizationCostTy Cost;
6395 
6396   // For each block.
6397   for (BasicBlock *BB : TheLoop->blocks()) {
6398     VectorizationCostTy BlockCost;
6399 
6400     // For each instruction in the old loop.
6401     for (Instruction &I : BB->instructionsWithoutDebug()) {
6402       // Skip ignored values.
6403       if (ValuesToIgnore.count(&I) ||
6404           (VF.isVector() && VecValuesToIgnore.count(&I)))
6405         continue;
6406 
6407       VectorizationCostTy C = getInstructionCost(&I, VF);
6408 
6409       // Check if we should override the cost.
6410       if (C.first.isValid() &&
6411           ForceTargetInstructionCost.getNumOccurrences() > 0)
6412         C.first = InstructionCost(ForceTargetInstructionCost);
6413 
6414       // Keep a list of instructions with invalid costs.
6415       if (Invalid && !C.first.isValid())
6416         Invalid->emplace_back(&I, VF);
6417 
6418       BlockCost.first += C.first;
6419       BlockCost.second |= C.second;
6420       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6421                         << " for VF " << VF << " For instruction: " << I
6422                         << '\n');
6423     }
6424 
6425     // If we are vectorizing a predicated block, it will have been
6426     // if-converted. This means that the block's instructions (aside from
6427     // stores and instructions that may divide by zero) will now be
6428     // unconditionally executed. For the scalar case, we may not always execute
6429     // the predicated block, if it is an if-else block. Thus, scale the block's
6430     // cost by the probability of executing it. blockNeedsPredication from
6431     // Legal is used so as to not include all blocks in tail folded loops.
6432     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6433       BlockCost.first /= getReciprocalPredBlockProb();
6434 
6435     Cost.first += BlockCost.first;
6436     Cost.second |= BlockCost.second;
6437   }
6438 
6439   return Cost;
6440 }
6441 
6442 /// Gets Address Access SCEV after verifying that the access pattern
6443 /// is loop invariant except the induction variable dependence.
6444 ///
6445 /// This SCEV can be sent to the Target in order to estimate the address
6446 /// calculation cost.
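/// For example, for a GEP such as
///   %addr = getelementptr inbounds i32, i32* %base, i64 %iv
/// where %iv is an induction variable, the pointer SCEV is returned; a GEP
/// with an index that is neither loop invariant nor an induction variable
/// makes this helper return nullptr.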
6447 static const SCEV *getAddressAccessSCEV(
6448               Value *Ptr,
6449               LoopVectorizationLegality *Legal,
6450               PredicatedScalarEvolution &PSE,
6451               const Loop *TheLoop) {
6452 
6453   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6454   if (!Gep)
6455     return nullptr;
6456 
6457   // We are looking for a gep with all loop invariant indices except for one
6458   // which should be an induction variable.
6459   auto SE = PSE.getSE();
6460   unsigned NumOperands = Gep->getNumOperands();
6461   for (unsigned i = 1; i < NumOperands; ++i) {
6462     Value *Opd = Gep->getOperand(i);
6463     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6464         !Legal->isInductionVariable(Opd))
6465       return nullptr;
6466   }
6467 
6468   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
6469   return PSE.getSCEV(Ptr);
6470 }
6471 
6472 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6473   return Legal->hasStride(I->getOperand(0)) ||
6474          Legal->hasStride(I->getOperand(1));
6475 }
6476 
6477 InstructionCost
6478 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6479                                                         ElementCount VF) {
6480   assert(VF.isVector() &&
6481          "Scalarization cost of instruction implies vectorization.");
6482   if (VF.isScalable())
6483     return InstructionCost::getInvalid();
6484 
6485   Type *ValTy = getLoadStoreType(I);
6486   auto SE = PSE.getSE();
6487 
6488   unsigned AS = getLoadStoreAddressSpace(I);
6489   Value *Ptr = getLoadStorePointerOperand(I);
6490   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6491   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6492   //       that it is being called from this specific place.
6493 
  // Figure out whether the access is strided and get the stride value, if it
  // is known at compile time.
6496   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6497 
6498   // Get the cost of the scalar memory instruction and address computation.
6499   InstructionCost Cost =
6500       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6501 
6502   // Don't pass *I here, since it is scalar but will actually be part of a
6503   // vectorized loop where the user of it is a vectorized instruction.
6504   const Align Alignment = getLoadStoreAlignment(I);
6505   Cost += VF.getKnownMinValue() *
6506           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6507                               AS, TTI::TCK_RecipThroughput);
6508 
6509   // Get the overhead of the extractelement and insertelement instructions
6510   // we might create due to scalarization.
6511   Cost += getScalarizationOverhead(I, VF);
6512 
6513   // If we have a predicated load/store, it will need extra i1 extracts and
6514   // conditional branches, but may not be executed for each vector lane. Scale
6515   // the cost by the probability of executing the predicated block.
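  // Under the current model (see getReciprocalPredBlockProb) a predicated
  // block is assumed to execute on roughly half of the iterations, so the
  // cost accumulated above is effectively halved before the extract and
  // branch overhead is added below.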
6516   if (isPredicatedInst(I, VF)) {
6517     Cost /= getReciprocalPredBlockProb();
6518 
6519     // Add the cost of an i1 extract and a branch
6520     auto *Vec_i1Ty =
6521         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6522     Cost += TTI.getScalarizationOverhead(
6523         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6524         /*Insert=*/false, /*Extract=*/true);
6525     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6526 
6527     if (useEmulatedMaskMemRefHack(I, VF))
6528       // Artificially setting to a high enough value to practically disable
6529       // vectorization with such operations.
6530       Cost = 3000000;
6531   }
6532 
6533   return Cost;
6534 }
6535 
6536 InstructionCost
6537 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6538                                                     ElementCount VF) {
6539   Type *ValTy = getLoadStoreType(I);
6540   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6541   Value *Ptr = getLoadStorePointerOperand(I);
6542   unsigned AS = getLoadStoreAddressSpace(I);
6543   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6544   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6545 
6546   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6547          "Stride should be 1 or -1 for consecutive memory access");
6548   const Align Alignment = getLoadStoreAlignment(I);
6549   InstructionCost Cost = 0;
6550   if (Legal->isMaskRequired(I))
6551     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6552                                       CostKind);
6553   else
6554     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6555                                 CostKind, I);
6556 
6557   bool Reverse = ConsecutiveStride < 0;
6558   if (Reverse)
6559     Cost +=
6560         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6561   return Cost;
6562 }
6563 
6564 InstructionCost
6565 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6566                                                 ElementCount VF) {
6567   assert(Legal->isUniformMemOp(*I));
6568 
6569   Type *ValTy = getLoadStoreType(I);
6570   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6571   const Align Alignment = getLoadStoreAlignment(I);
6572   unsigned AS = getLoadStoreAddressSpace(I);
6573   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6574   if (isa<LoadInst>(I)) {
6575     return TTI.getAddressComputationCost(ValTy) +
6576            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6577                                CostKind) +
6578            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6579   }
6580   StoreInst *SI = cast<StoreInst>(I);
6581 
6582   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6583   return TTI.getAddressComputationCost(ValTy) +
6584          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6585                              CostKind) +
6586          (isLoopInvariantStoreValue
6587               ? 0
6588               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6589                                        VF.getKnownMinValue() - 1));
6590 }
6591 
6592 InstructionCost
6593 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6594                                                  ElementCount VF) {
6595   Type *ValTy = getLoadStoreType(I);
6596   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6597   const Align Alignment = getLoadStoreAlignment(I);
6598   const Value *Ptr = getLoadStorePointerOperand(I);
6599 
6600   return TTI.getAddressComputationCost(VectorTy) +
6601          TTI.getGatherScatterOpCost(
6602              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6603              TargetTransformInfo::TCK_RecipThroughput, I);
6604 }
6605 
6606 InstructionCost
6607 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6608                                                    ElementCount VF) {
6609   // TODO: Once we have support for interleaving with scalable vectors
6610   // we can calculate the cost properly here.
6611   if (VF.isScalable())
6612     return InstructionCost::getInvalid();
6613 
6614   Type *ValTy = getLoadStoreType(I);
6615   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6616   unsigned AS = getLoadStoreAddressSpace(I);
6617 
6618   auto Group = getInterleavedAccessGroup(I);
6619   assert(Group && "Fail to get an interleaved access group.");
6620 
6621   unsigned InterleaveFactor = Group->getFactor();
6622   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6623 
6624   // Holds the indices of existing members in the interleaved group.
6625   SmallVector<unsigned, 4> Indices;
6626   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6627     if (Group->getMember(IF))
6628       Indices.push_back(IF);
6629 
6630   // Calculate the cost of the whole interleaved group.
6631   bool UseMaskForGaps =
6632       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6633       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6634   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6635       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6636       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6637 
6638   if (Group->isReverse()) {
6639     // TODO: Add support for reversed masked interleaved access.
6640     assert(!Legal->isMaskRequired(I) &&
6641            "Reverse masked interleaved access not supported.");
6642     Cost +=
6643         Group->getNumMembers() *
6644         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6645   }
6646   return Cost;
6647 }
6648 
6649 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6650     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6651   using namespace llvm::PatternMatch;
6652   // Early exit for no inloop reductions
6653   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6654     return None;
6655   auto *VectorTy = cast<VectorType>(Ty);
6656 
  // We are looking for a pattern of, and finding the minimal acceptable cost
  // for:
6658   //  reduce(mul(ext(A), ext(B))) or
6659   //  reduce(mul(A, B)) or
6660   //  reduce(ext(A)) or
6661   //  reduce(A).
6662   // The basic idea is that we walk down the tree to do that, finding the root
6663   // reduction instruction in InLoopReductionImmediateChains. From there we find
6664   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6665   // of the components. If the reduction cost is lower then we return it for the
6666   // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return an invalid cost specifying that the original cost
  // method should be used.
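  // As a source-level sketch, a loop body such as
  //   sum += (int)a[i] * (int)b[i];   // with i8 elements in a[] and b[]
  // feeds an add reduction with mul(sext(A), sext(B)); when the target prices
  // getExtendedAddReductionCost below the combined cost of the separate
  // extends, multiply and plain reduction, the whole pattern is costed as one
  // extended multiply-accumulate reduction.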
6669   Instruction *RetI = I;
6670   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6671     if (!RetI->hasOneUser())
6672       return None;
6673     RetI = RetI->user_back();
6674   }
6675   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6676       RetI->user_back()->getOpcode() == Instruction::Add) {
6677     if (!RetI->hasOneUser())
6678       return None;
6679     RetI = RetI->user_back();
6680   }
6681 
6682   // Test if the found instruction is a reduction, and if not return an invalid
6683   // cost specifying the parent to use the original cost modelling.
6684   if (!InLoopReductionImmediateChains.count(RetI))
6685     return None;
6686 
6687   // Find the reduction this chain is a part of and calculate the basic cost of
6688   // the reduction on its own.
6689   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6690   Instruction *ReductionPhi = LastChain;
6691   while (!isa<PHINode>(ReductionPhi))
6692     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6693 
6694   const RecurrenceDescriptor &RdxDesc =
6695       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6696 
6697   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6698       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6699 
6700   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6701   // normal fmul instruction to the cost of the fadd reduction.
6702   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6703     BaseCost +=
6704         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6705 
6706   // If we're using ordered reductions then we can just return the base cost
6707   // here, since getArithmeticReductionCost calculates the full ordered
6708   // reduction cost when FP reassociation is not allowed.
6709   if (useOrderedReductions(RdxDesc))
6710     return BaseCost;
6711 
6712   // Get the operand that was not the reduction chain and match it to one of the
6713   // patterns, returning the better cost if it is found.
6714   Instruction *RedOp = RetI->getOperand(1) == LastChain
6715                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6716                            : dyn_cast<Instruction>(RetI->getOperand(1));
6717 
6718   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6719 
6720   Instruction *Op0, *Op1;
6721   if (RedOp &&
6722       match(RedOp,
6723             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6724       match(Op0, m_ZExtOrSExt(m_Value())) &&
6725       Op0->getOpcode() == Op1->getOpcode() &&
6726       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6727       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6728       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6729 
    // Matched reduce(ext(mul(ext(A), ext(B))))
6731     // Note that the extend opcodes need to all match, or if A==B they will have
6732     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6733     // which is equally fine.
6734     bool IsUnsigned = isa<ZExtInst>(Op0);
6735     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6736     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6737 
6738     InstructionCost ExtCost =
6739         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6740                              TTI::CastContextHint::None, CostKind, Op0);
6741     InstructionCost MulCost =
6742         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6743     InstructionCost Ext2Cost =
6744         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6745                              TTI::CastContextHint::None, CostKind, RedOp);
6746 
6747     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6748         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6749         CostKind);
6750 
6751     if (RedCost.isValid() &&
6752         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6753       return I == RetI ? RedCost : 0;
6754   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6755              !TheLoop->isLoopInvariant(RedOp)) {
6756     // Matched reduce(ext(A))
6757     bool IsUnsigned = isa<ZExtInst>(RedOp);
6758     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6759     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6760         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6761         CostKind);
6762 
6763     InstructionCost ExtCost =
6764         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6765                              TTI::CastContextHint::None, CostKind, RedOp);
6766     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6767       return I == RetI ? RedCost : 0;
6768   } else if (RedOp &&
6769              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6770     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6771         Op0->getOpcode() == Op1->getOpcode() &&
6772         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6773       bool IsUnsigned = isa<ZExtInst>(Op0);
6774       Type *Op0Ty = Op0->getOperand(0)->getType();
6775       Type *Op1Ty = Op1->getOperand(0)->getType();
6776       Type *LargestOpTy =
6777           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6778                                                                     : Op0Ty;
6779       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6780 
6781       // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
6782       // different sizes. We take the largest type as the ext to reduce, and add
6783       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6784       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6785           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6786           TTI::CastContextHint::None, CostKind, Op0);
6787       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6788           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6789           TTI::CastContextHint::None, CostKind, Op1);
6790       InstructionCost MulCost =
6791           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6792 
6793       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6794           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6795           CostKind);
6796       InstructionCost ExtraExtCost = 0;
6797       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6798         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6799         ExtraExtCost = TTI.getCastInstrCost(
6800             ExtraExtOp->getOpcode(), ExtType,
6801             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6802             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6803       }
6804 
6805       if (RedCost.isValid() &&
6806           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6807         return I == RetI ? RedCost : 0;
6808     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6809       // Matched reduce(mul())
6810       InstructionCost MulCost =
6811           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6812 
6813       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6814           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
6815           CostKind);
6816 
6817       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6818         return I == RetI ? RedCost : 0;
6819     }
6820   }
6821 
6822   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
6823 }
6824 
6825 InstructionCost
6826 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6827                                                      ElementCount VF) {
  // Calculate the scalar cost only. The vectorization cost should already
  // have been computed by this point.
6830   if (VF.isScalar()) {
6831     Type *ValTy = getLoadStoreType(I);
6832     const Align Alignment = getLoadStoreAlignment(I);
6833     unsigned AS = getLoadStoreAddressSpace(I);
6834 
6835     return TTI.getAddressComputationCost(ValTy) +
6836            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6837                                TTI::TCK_RecipThroughput, I);
6838   }
6839   return getWideningCost(I, VF);
6840 }
6841 
6842 LoopVectorizationCostModel::VectorizationCostTy
6843 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6844                                                ElementCount VF) {
6845   // If we know that this instruction will remain uniform, check the cost of
6846   // the scalar version.
6847   if (isUniformAfterVectorization(I, VF))
6848     VF = ElementCount::getFixed(1);
6849 
6850   if (VF.isVector() && isProfitableToScalarize(I, VF))
6851     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6852 
6853   // Forced scalars do not have any scalarization overhead.
6854   auto ForcedScalar = ForcedScalars.find(VF);
6855   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6856     auto InstSet = ForcedScalar->second;
6857     if (InstSet.count(I))
6858       return VectorizationCostTy(
6859           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6860            VF.getKnownMinValue()),
6861           false);
6862   }
6863 
6864   Type *VectorTy;
6865   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6866 
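  // A vector value counts as "not scalarized" when the target legalizes it
  // into fewer parts than it has (known minimum) elements; if the target
  // reports zero parts for the type, the cost is treated as invalid.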
6867   bool TypeNotScalarized = false;
6868   if (VF.isVector() && VectorTy->isVectorTy()) {
6869     unsigned NumParts = TTI.getNumberOfParts(VectorTy);
6870     if (NumParts)
6871       TypeNotScalarized = NumParts < VF.getKnownMinValue();
6872     else
6873       C = InstructionCost::getInvalid();
6874   }
6875   return VectorizationCostTy(C, TypeNotScalarized);
6876 }
6877 
6878 InstructionCost
6879 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6880                                                      ElementCount VF) const {
6881 
6882   // There is no mechanism yet to create a scalable scalarization loop,
6883   // so this is currently Invalid.
6884   if (VF.isScalable())
6885     return InstructionCost::getInvalid();
6886 
6887   if (VF.isScalar())
6888     return 0;
6889 
6890   InstructionCost Cost = 0;
6891   Type *RetTy = ToVectorTy(I->getType(), VF);
6892   if (!RetTy->isVoidTy() &&
6893       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6894     Cost += TTI.getScalarizationOverhead(
6895         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
6896         false);
6897 
6898   // Some targets keep addresses scalar.
6899   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6900     return Cost;
6901 
6902   // Some targets support efficient element stores.
6903   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6904     return Cost;
6905 
6906   // Collect operands to consider.
6907   CallInst *CI = dyn_cast<CallInst>(I);
6908   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6909 
6910   // Skip operands that do not require extraction/scalarization and do not incur
6911   // any overhead.
6912   SmallVector<Type *> Tys;
6913   for (auto *V : filterExtractingOperands(Ops, VF))
6914     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6915   return Cost + TTI.getOperandsScalarizationOverhead(
6916                     filterExtractingOperands(Ops, VF), Tys);
6917 }
6918 
6919 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6920   if (VF.isScalar())
6921     return;
6922   NumPredStores = 0;
6923   for (BasicBlock *BB : TheLoop->blocks()) {
6924     // For each instruction in the old loop.
6925     for (Instruction &I : *BB) {
6926       Value *Ptr =  getLoadStorePointerOperand(&I);
6927       if (!Ptr)
6928         continue;
6929 
6930       // TODO: We should generate better code and update the cost model for
6931       // predicated uniform stores. Today they are treated as any other
6932       // predicated store (see added test cases in
6933       // invariant-store-vectorization.ll).
6934       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6935         NumPredStores++;
6936 
6937       if (Legal->isUniformMemOp(I)) {
6938         // TODO: Avoid replicating loads and stores instead of
6939         // relying on instcombine to remove them.
6940         // Load: Scalar load + broadcast
6941         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6942         InstructionCost Cost;
6943         if (isa<StoreInst>(&I) && VF.isScalable() &&
6944             isLegalGatherOrScatter(&I, VF)) {
6945           Cost = getGatherScatterCost(&I, VF);
6946           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
6947         } else {
6948           assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
6949                  "Cannot yet scalarize uniform stores");
6950           Cost = getUniformMemOpCost(&I, VF);
6951           setWideningDecision(&I, VF, CM_Scalarize, Cost);
6952         }
6953         continue;
6954       }
6955 
6956       // We assume that widening is the best solution when possible.
6957       if (memoryInstructionCanBeWidened(&I, VF)) {
6958         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6959         int ConsecutiveStride = Legal->isConsecutivePtr(
6960             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6961         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6962                "Expected consecutive stride.");
6963         InstWidening Decision =
6964             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6965         setWideningDecision(&I, VF, Decision, Cost);
6966         continue;
6967       }
6968 
6969       // Choose between Interleaving, Gather/Scatter or Scalarization.
6970       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6971       unsigned NumAccesses = 1;
6972       if (isAccessInterleaved(&I)) {
6973         auto Group = getInterleavedAccessGroup(&I);
6974         assert(Group && "Fail to get an interleaved access group.");
6975 
6976         // Make one decision for the whole group.
6977         if (getWideningDecision(&I, VF) != CM_Unknown)
6978           continue;
6979 
6980         NumAccesses = Group->getNumMembers();
6981         if (interleavedAccessCanBeWidened(&I, VF))
6982           InterleaveCost = getInterleaveGroupCost(&I, VF);
6983       }
6984 
6985       InstructionCost GatherScatterCost =
6986           isLegalGatherOrScatter(&I, VF)
6987               ? getGatherScatterCost(&I, VF) * NumAccesses
6988               : InstructionCost::getInvalid();
6989 
6990       InstructionCost ScalarizationCost =
6991           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6992 
      // Choose the best option for the current VF, record the decision, and
      // use it during vectorization.
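      // Note that interleaving wins a tie with gather/scatter but must be
      // strictly cheaper than scalarization, and gather/scatter is likewise
      // only chosen when strictly cheaper than scalarization.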
6995       InstructionCost Cost;
6996       InstWidening Decision;
6997       if (InterleaveCost <= GatherScatterCost &&
6998           InterleaveCost < ScalarizationCost) {
6999         Decision = CM_Interleave;
7000         Cost = InterleaveCost;
7001       } else if (GatherScatterCost < ScalarizationCost) {
7002         Decision = CM_GatherScatter;
7003         Cost = GatherScatterCost;
7004       } else {
7005         Decision = CM_Scalarize;
7006         Cost = ScalarizationCost;
7007       }
      // If the instruction belongs to an interleave group, the whole group
7009       // receives the same decision. The whole group receives the cost, but
7010       // the cost will actually be assigned to one instruction.
7011       if (auto Group = getInterleavedAccessGroup(&I))
7012         setWideningDecision(Group, VF, Decision, Cost);
7013       else
7014         setWideningDecision(&I, VF, Decision, Cost);
7015     }
7016   }
7017 
7018   // Make sure that any load of address and any other address computation
7019   // remains scalar unless there is gather/scatter support. This avoids
7020   // inevitable extracts into address registers, and also has the benefit of
7021   // activating LSR more, since that pass can't optimize vectorized
7022   // addresses.
7023   if (TTI.prefersVectorizedAddressing())
7024     return;
7025 
7026   // Start with all scalar pointer uses.
7027   SmallPtrSet<Instruction *, 8> AddrDefs;
7028   for (BasicBlock *BB : TheLoop->blocks())
7029     for (Instruction &I : *BB) {
7030       Instruction *PtrDef =
7031         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7032       if (PtrDef && TheLoop->contains(PtrDef) &&
7033           getWideningDecision(&I, VF) != CM_GatherScatter)
7034         AddrDefs.insert(PtrDef);
7035     }
7036 
7037   // Add all instructions used to generate the addresses.
7038   SmallVector<Instruction *, 4> Worklist;
7039   append_range(Worklist, AddrDefs);
7040   while (!Worklist.empty()) {
7041     Instruction *I = Worklist.pop_back_val();
7042     for (auto &Op : I->operands())
7043       if (auto *InstOp = dyn_cast<Instruction>(Op))
7044         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7045             AddrDefs.insert(InstOp).second)
7046           Worklist.push_back(InstOp);
7047   }
7048 
7049   for (auto *I : AddrDefs) {
7050     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves the task of finding out
7053       // if the loaded register is involved in an address computation, it is
7054       // instead changed here when we know this is the case.
7055       InstWidening Decision = getWideningDecision(I, VF);
7056       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7057         // Scalarize a widened load of address.
7058         setWideningDecision(
7059             I, VF, CM_Scalarize,
7060             (VF.getKnownMinValue() *
7061              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7062       else if (auto Group = getInterleavedAccessGroup(I)) {
7063         // Scalarize an interleave group of address loads.
7064         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7065           if (Instruction *Member = Group->getMember(I))
7066             setWideningDecision(
7067                 Member, VF, CM_Scalarize,
7068                 (VF.getKnownMinValue() *
7069                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7070         }
7071       }
7072     } else
      // Make sure I gets scalarized, with a cost estimate that does not
      // include scalarization overhead.
7075       ForcedScalars[VF].insert(I);
7076   }
7077 }
7078 
7079 InstructionCost
7080 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7081                                                Type *&VectorTy) {
7082   Type *RetTy = I->getType();
7083   if (canTruncateToMinimalBitwidth(I, VF))
7084     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7085   auto SE = PSE.getSE();
7086   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7087 
7088   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7089                                                 ElementCount VF) -> bool {
7090     if (VF.isScalar())
7091       return true;
7092 
7093     auto Scalarized = InstsToScalarize.find(VF);
7094     assert(Scalarized != InstsToScalarize.end() &&
7095            "VF not yet analyzed for scalarization profitability");
7096     return !Scalarized->second.count(I) &&
7097            llvm::all_of(I->users(), [&](User *U) {
7098              auto *UI = cast<Instruction>(U);
7099              return !Scalarized->second.count(UI);
7100            });
7101   };
7102   (void) hasSingleCopyAfterVectorization;
7103 
7104   if (isScalarAfterVectorization(I, VF)) {
7105     // With the exception of GEPs and PHIs, after scalarization there should
7106     // only be one copy of the instruction generated in the loop. This is
7107     // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
7109     // it means we don't have to multiply the instruction cost by VF.
7110     assert(I->getOpcode() == Instruction::GetElementPtr ||
7111            I->getOpcode() == Instruction::PHI ||
7112            (I->getOpcode() == Instruction::BitCast &&
7113             I->getType()->isPointerTy()) ||
7114            hasSingleCopyAfterVectorization(I, VF));
7115     VectorTy = RetTy;
7116   } else
7117     VectorTy = ToVectorTy(RetTy, VF);
7118 
7119   // TODO: We need to estimate the cost of intrinsic calls.
7120   switch (I->getOpcode()) {
7121   case Instruction::GetElementPtr:
7122     // We mark this instruction as zero-cost because the cost of GEPs in
7123     // vectorized code depends on whether the corresponding memory instruction
7124     // is scalarized or not. Therefore, we handle GEPs with the memory
7125     // instruction cost.
7126     return 0;
7127   case Instruction::Br: {
7128     // In cases of scalarized and predicated instructions, there will be VF
7129     // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
7131     bool ScalarPredicatedBB = false;
7132     BranchInst *BI = cast<BranchInst>(I);
7133     if (VF.isVector() && BI->isConditional() &&
7134         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7135          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7136       ScalarPredicatedBB = true;
7137 
7138     if (ScalarPredicatedBB) {
7139       // Not possible to scalarize scalable vector with predicated instructions.
7140       if (VF.isScalable())
7141         return InstructionCost::getInvalid();
7142       // Return cost for branches around scalarized and predicated blocks.
7143       auto *Vec_i1Ty =
7144           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7145       return (
7146           TTI.getScalarizationOverhead(
7147               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7148           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7149     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7150       // The back-edge branch will remain, as will all scalar branches.
7151       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7152     else
7153       // This branch will be eliminated by if-conversion.
7154       return 0;
7155     // Note: We currently assume zero cost for an unconditional branch inside
7156     // a predicated block since it will become a fall-through, although we
7157     // may decide in the future to call TTI for all branches.
7158   }
7159   case Instruction::PHI: {
7160     auto *Phi = cast<PHINode>(I);
7161 
7162     // First-order recurrences are replaced by vector shuffles inside the loop.
7163     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7164     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7165       return TTI.getShuffleCost(
7166           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7167           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7168 
7169     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7170     // converted into select instructions. We require N - 1 selects per phi
7171     // node, where N is the number of incoming values.
7172     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7173       return (Phi->getNumIncomingValues() - 1) *
7174              TTI.getCmpSelInstrCost(
7175                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7176                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7177                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7178 
7179     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7180   }
7181   case Instruction::UDiv:
7182   case Instruction::SDiv:
7183   case Instruction::URem:
7184   case Instruction::SRem:
7185     // If we have a predicated instruction, it may not be executed for each
7186     // vector lane. Get the scalarization cost and scale this amount by the
7187     // probability of executing the predicated block. If the instruction is not
7188     // predicated, we fall through to the next case.
7189     if (VF.isVector() && isScalarWithPredication(I, VF)) {
7190       InstructionCost Cost = 0;
7191 
7192       // These instructions have a non-void type, so account for the phi nodes
7193       // that we will create. This cost is likely to be zero. The phi node
7194       // cost, if any, should be scaled by the block probability because it
7195       // models a copy at the end of each predicated block.
7196       Cost += VF.getKnownMinValue() *
7197               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7198 
7199       // The cost of the non-predicated instruction.
7200       Cost += VF.getKnownMinValue() *
7201               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7202 
7203       // The cost of insertelement and extractelement instructions needed for
7204       // scalarization.
7205       Cost += getScalarizationOverhead(I, VF);
7206 
7207       // Scale the cost by the probability of executing the predicated blocks.
7208       // This assumes the predicated block for each vector lane is equally
7209       // likely.
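      // With the default reciprocal block probability of 2, this halves the
      // scalarized cost, modelling predicated blocks that are assumed to
      // execute on roughly half of the iterations.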
7210       return Cost / getReciprocalPredBlockProb();
7211     }
7212     LLVM_FALLTHROUGH;
7213   case Instruction::Add:
7214   case Instruction::FAdd:
7215   case Instruction::Sub:
7216   case Instruction::FSub:
7217   case Instruction::Mul:
7218   case Instruction::FMul:
7219   case Instruction::FDiv:
7220   case Instruction::FRem:
7221   case Instruction::Shl:
7222   case Instruction::LShr:
7223   case Instruction::AShr:
7224   case Instruction::And:
7225   case Instruction::Or:
7226   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
7228     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7229       return 0;
7230 
7231     // Detect reduction patterns
7232     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7233       return *RedCost;
7234 
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7237     Value *Op2 = I->getOperand(1);
7238     TargetTransformInfo::OperandValueProperties Op2VP;
7239     TargetTransformInfo::OperandValueKind Op2VK =
7240         TTI.getOperandInfo(Op2, Op2VP);
7241     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7242       Op2VK = TargetTransformInfo::OK_UniformValue;
7243 
7244     SmallVector<const Value *, 4> Operands(I->operand_values());
7245     return TTI.getArithmeticInstrCost(
7246         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7247         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7248   }
7249   case Instruction::FNeg: {
7250     return TTI.getArithmeticInstrCost(
7251         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7252         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7253         TargetTransformInfo::OP_None, I->getOperand(0), I);
7254   }
7255   case Instruction::Select: {
7256     SelectInst *SI = cast<SelectInst>(I);
7257     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7258     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7259 
7260     const Value *Op0, *Op1;
7261     using namespace llvm::PatternMatch;
7262     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7263                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7264       // select x, y, false --> x & y
7265       // select x, true, y --> x | y
7266       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7267       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7268       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7269       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7270       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7271               Op1->getType()->getScalarSizeInBits() == 1);
7272 
7273       SmallVector<const Value *, 2> Operands{Op0, Op1};
7274       return TTI.getArithmeticInstrCost(
7275           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7276           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7277     }
7278 
7279     Type *CondTy = SI->getCondition()->getType();
7280     if (!ScalarCond)
7281       CondTy = VectorType::get(CondTy, VF);
7282 
7283     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7284     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7285       Pred = Cmp->getPredicate();
7286     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7287                                   CostKind, I);
7288   }
7289   case Instruction::ICmp:
7290   case Instruction::FCmp: {
7291     Type *ValTy = I->getOperand(0)->getType();
7292     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7293     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7294       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7295     VectorTy = ToVectorTy(ValTy, VF);
7296     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7297                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7298                                   I);
7299   }
7300   case Instruction::Store:
7301   case Instruction::Load: {
7302     ElementCount Width = VF;
7303     if (Width.isVector()) {
7304       InstWidening Decision = getWideningDecision(I, Width);
7305       assert(Decision != CM_Unknown &&
7306              "CM decision should be taken at this point");
7307       if (Decision == CM_Scalarize)
7308         Width = ElementCount::getFixed(1);
7309     }
7310     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7311     return getMemoryInstructionCost(I, VF);
7312   }
7313   case Instruction::BitCast:
7314     if (I->getType()->isPointerTy())
7315       return 0;
7316     LLVM_FALLTHROUGH;
7317   case Instruction::ZExt:
7318   case Instruction::SExt:
7319   case Instruction::FPToUI:
7320   case Instruction::FPToSI:
7321   case Instruction::FPExt:
7322   case Instruction::PtrToInt:
7323   case Instruction::IntToPtr:
7324   case Instruction::SIToFP:
7325   case Instruction::UIToFP:
7326   case Instruction::Trunc:
7327   case Instruction::FPTrunc: {
7328     // Computes the CastContextHint from a Load/Store instruction.
7329     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7330       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7331              "Expected a load or a store!");
7332 
7333       if (VF.isScalar() || !TheLoop->contains(I))
7334         return TTI::CastContextHint::Normal;
7335 
7336       switch (getWideningDecision(I, VF)) {
7337       case LoopVectorizationCostModel::CM_GatherScatter:
7338         return TTI::CastContextHint::GatherScatter;
7339       case LoopVectorizationCostModel::CM_Interleave:
7340         return TTI::CastContextHint::Interleave;
7341       case LoopVectorizationCostModel::CM_Scalarize:
7342       case LoopVectorizationCostModel::CM_Widen:
7343         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7344                                         : TTI::CastContextHint::Normal;
7345       case LoopVectorizationCostModel::CM_Widen_Reverse:
7346         return TTI::CastContextHint::Reversed;
7347       case LoopVectorizationCostModel::CM_Unknown:
7348         llvm_unreachable("Instr did not go through cost modelling?");
7349       }
7350 
7351       llvm_unreachable("Unhandled case!");
7352     };
7353 
7354     unsigned Opcode = I->getOpcode();
7355     TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc/FPTrunc, the context is the only user, which must be a
    // StoreInst.
7357     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7358       if (I->hasOneUse())
7359         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7360           CCH = ComputeCCH(Store);
7361     }
    // For ZExt/SExt/FPExt, the context is the operand, which must be a
    // LoadInst.
7363     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7364              Opcode == Instruction::FPExt) {
7365       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7366         CCH = ComputeCCH(Load);
7367     }
7368 
7369     // We optimize the truncation of induction variables having constant
7370     // integer steps. The cost of these truncations is the same as the scalar
7371     // operation.
7372     if (isOptimizableIVTruncate(I, VF)) {
7373       auto *Trunc = cast<TruncInst>(I);
7374       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7375                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7376     }
7377 
7378     // Detect reduction patterns
7379     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7380       return *RedCost;
7381 
7382     Type *SrcScalarTy = I->getOperand(0)->getType();
7383     Type *SrcVecTy =
7384         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7385     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7389       //
7390       // Calculate the modified src and dest types.
7391       Type *MinVecTy = VectorTy;
7392       if (Opcode == Instruction::Trunc) {
7393         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7394         VectorTy =
7395             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7396       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7397         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7398         VectorTy =
7399             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7400       }
7401     }
7402 
7403     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7404   }
7405   case Instruction::Call: {
7406     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7407       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7408         return *RedCost;
7409     bool NeedToScalarize;
7410     CallInst *CI = cast<CallInst>(I);
7411     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7412     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7413       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7414       return std::min(CallCost, IntrinsicCost);
7415     }
7416     return CallCost;
7417   }
7418   case Instruction::ExtractValue:
7419     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7420   case Instruction::Alloca:
7421     // We cannot easily widen alloca to a scalable alloca, as
7422     // the result would need to be a vector of pointers.
7423     if (VF.isScalable())
7424       return InstructionCost::getInvalid();
7425     LLVM_FALLTHROUGH;
7426   default:
7427     // This opcode is unknown. Assume that it is the same as 'mul'.
7428     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7429   } // end of switch.
7430 }
7431 
7432 char LoopVectorize::ID = 0;
7433 
7434 static const char lv_name[] = "Loop Vectorization";
7435 
7436 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7437 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7438 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7439 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7440 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7441 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7442 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7443 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7444 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7445 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7446 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7447 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7448 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7449 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7450 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7451 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7452 
7453 namespace llvm {
7454 
7455 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7456 
7457 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7458                               bool VectorizeOnlyWhenForced) {
7459   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7460 }
7461 
7462 } // end namespace llvm
7463 
7464 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7465   // Check if the pointer operand of a load or store instruction is
7466   // consecutive.
7467   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7468     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7469   return false;
7470 }
7471 
7472 void LoopVectorizationCostModel::collectValuesToIgnore() {
7473   // Ignore ephemeral values.
7474   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7475 
7476   // Ignore type-promoting instructions we identified during reduction
7477   // detection.
7478   for (auto &Reduction : Legal->getReductionVars()) {
7479     const RecurrenceDescriptor &RedDes = Reduction.second;
7480     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7481     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7482   }
7483   // Ignore type-casting instructions we identified during induction
7484   // detection.
7485   for (auto &Induction : Legal->getInductionVars()) {
7486     const InductionDescriptor &IndDes = Induction.second;
7487     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7488     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7489   }
7490 }
7491 
7492 void LoopVectorizationCostModel::collectInLoopReductions() {
7493   for (auto &Reduction : Legal->getReductionVars()) {
7494     PHINode *Phi = Reduction.first;
7495     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7496 
7497     // We don't collect reductions that are type promoted (yet).
7498     if (RdxDesc.getRecurrenceType() != Phi->getType())
7499       continue;
7500 
7501     // If the target would prefer this reduction to happen "in-loop", then we
7502     // want to record it as such.
7503     unsigned Opcode = RdxDesc.getOpcode();
7504     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7505         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7506                                    TargetTransformInfo::ReductionFlags()))
7507       continue;
7508 
7509     // Check that we can correctly put the reductions into the loop, by
7510     // finding the chain of operations that leads from the phi to the loop
7511     // exit value.
7512     SmallVector<Instruction *, 4> ReductionOperations =
7513         RdxDesc.getReductionOpChain(Phi, TheLoop);
7514     bool InLoop = !ReductionOperations.empty();
7515     if (InLoop) {
7516       InLoopReductionChains[Phi] = ReductionOperations;
7517       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7518       Instruction *LastChain = Phi;
7519       for (auto *I : ReductionOperations) {
7520         InLoopReductionImmediateChains[I] = LastChain;
7521         LastChain = I;
7522       }
7523     }
7524     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7525                       << " reduction for phi: " << *Phi << "\n");
7526   }
7527 }
7528 
// TODO: we could return a pair of values that specify the max VF and
// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do that yet because VPlan does not
// have a cost model that can choose which plan to execute when more
// than one is generated.
7534 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7535                                  LoopVectorizationCostModel &CM) {
7536   unsigned WidestType;
7537   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
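  // For example, 256-bit wide vector registers and a widest scalar type of 32
  // bits yield a VF of 8.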
7538   return WidestVectorRegBits / WidestType;
7539 }
7540 
7541 VectorizationFactor
7542 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7543   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7544   ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
7547   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7548   // the vectorization pipeline.
7549   if (!OrigLoop->isInnermost()) {
7550     // If the user doesn't provide a vectorization factor, determine a
7551     // reasonable one.
7552     if (UserVF.isZero()) {
7553       VF = ElementCount::getFixed(determineVPlanVF(
7554           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7555               .getFixedSize(),
7556           CM));
7557       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7558 
7559       // Make sure we have a VF > 1 for stress testing.
7560       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7561         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7562                           << "overriding computed VF.\n");
7563         VF = ElementCount::getFixed(4);
7564       }
7565     }
7566     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7567     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7568            "VF needs to be a power of two");
7569     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7570                       << "VF " << VF << " to build VPlans.\n");
7571     buildVPlans(VF, VF);
7572 
7573     // For VPlan build stress testing, we bail out after VPlan construction.
7574     if (VPlanBuildStressTest)
7575       return VectorizationFactor::Disabled();
7576 
7577     return {VF, 0 /*Cost*/};
7578   }
7579 
7580   LLVM_DEBUG(
7581       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7582                 "VPlan-native path.\n");
7583   return VectorizationFactor::Disabled();
7584 }
7585 
7586 Optional<VectorizationFactor>
7587 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7588   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7589   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7591     return None;
7592 
7593   // Invalidate interleave groups if all blocks of loop will be predicated.
7594   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7595       !useMaskedInterleavedAccesses(*TTI)) {
7596     LLVM_DEBUG(
7597         dbgs()
7598         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7599            "which requires masked-interleaved support.\n");
7600     if (CM.InterleaveInfo.invalidateGroups())
7601       // Invalidating interleave groups also requires invalidating all decisions
7602       // based on them, which includes widening decisions and uniform and scalar
7603       // values.
7604       CM.invalidateCostModelingDecisions();
7605   }
7606 
7607   ElementCount MaxUserVF =
7608       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7609   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7610   if (!UserVF.isZero() && UserVFIsLegal) {
7611     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7612            "VF needs to be a power of two");
7613     // Collect the instructions (and their associated costs) that will be more
7614     // profitable to scalarize.
7615     if (CM.selectUserVectorizationFactor(UserVF)) {
7616       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7617       CM.collectInLoopReductions();
7618       buildVPlansWithVPRecipes(UserVF, UserVF);
7619       LLVM_DEBUG(printPlans(dbgs()));
7620       return {{UserVF, 0}};
7621     } else
7622       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7623                               "InvalidCost", ORE, OrigLoop);
7624   }
7625 
7626   // Populate the set of Vectorization Factor Candidates.
7627   ElementCountSet VFCandidates;
7628   for (auto VF = ElementCount::getFixed(1);
7629        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7630     VFCandidates.insert(VF);
7631   for (auto VF = ElementCount::getScalable(1);
7632        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7633     VFCandidates.insert(VF);
7634 
7635   for (const auto &VF : VFCandidates) {
7636     // Collect Uniform and Scalar instructions after vectorization with VF.
7637     CM.collectUniformsAndScalars(VF);
7638 
7639     // Collect the instructions (and their associated costs) that will be more
7640     // profitable to scalarize.
7641     if (VF.isVector())
7642       CM.collectInstsToScalarize(VF);
7643   }
7644 
7645   CM.collectInLoopReductions();
7646   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7647   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7648 
7649   LLVM_DEBUG(printPlans(dbgs()));
7650   if (!MaxFactors.hasVector())
7651     return VectorizationFactor::Disabled();
7652 
7653   // Select the optimal vectorization factor.
7654   auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
7655 
7656   // Check if it is profitable to vectorize with runtime checks.
7657   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
7658   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
7659     bool PragmaThresholdReached =
7660         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
7661     bool ThresholdReached =
7662         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
7663     if ((ThresholdReached && !Hints.allowReordering()) ||
7664         PragmaThresholdReached) {
7665       ORE->emit([&]() {
7666         return OptimizationRemarkAnalysisAliasing(
7667                    DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
7668                    OrigLoop->getHeader())
7669                << "loop not vectorized: cannot prove it is safe to reorder "
7670                   "memory operations";
7671       });
7672       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
7673       Hints.emitRemarkWithHints();
7674       return VectorizationFactor::Disabled();
7675     }
7676   }
7677   return SelectedVF;
7678 }
7679 
7680 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7681   assert(count_if(VPlans,
7682                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7683              1 &&
7684          "Best VF has not a single VPlan.");
7685 
7686   for (const VPlanPtr &Plan : VPlans) {
7687     if (Plan->hasVF(VF))
7688       return *Plan.get();
7689   }
7690   llvm_unreachable("No plan found!");
7691 }
7692 
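// Add metadata to \p L disabling runtime unrolling, unless unroll-disable
// metadata is already present. The resulting loop ID would look something
// like:
//   !0 = distinct !{!0, ..., !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}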
7693 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7694   SmallVector<Metadata *, 4> MDs;
7695   // Reserve first location for self reference to the LoopID metadata node.
7696   MDs.push_back(nullptr);
7697   bool IsUnrollMetadata = false;
7698   MDNode *LoopID = L->getLoopID();
7699   if (LoopID) {
7700     // First find existing loop unrolling disable metadata.
7701     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7702       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7703       if (MD) {
7704         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7705         IsUnrollMetadata =
7706             S && S->getString().startswith("llvm.loop.unroll.disable");
7707       }
7708       MDs.push_back(LoopID->getOperand(i));
7709     }
7710   }
7711 
7712   if (!IsUnrollMetadata) {
7713     // Add runtime unroll disable metadata.
7714     LLVMContext &Context = L->getHeader()->getContext();
7715     SmallVector<Metadata *, 1> DisableOperands;
7716     DisableOperands.push_back(
7717         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7718     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7719     MDs.push_back(DisableNode);
7720     MDNode *NewLoopID = MDNode::get(Context, MDs);
7721     // Set operand 0 to refer to the loop id itself.
7722     NewLoopID->replaceOperandWith(0, NewLoopID);
7723     L->setLoopID(NewLoopID);
7724   }
7725 }
7726 
7727 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7728                                            VPlan &BestVPlan,
7729                                            InnerLoopVectorizer &ILV,
7730                                            DominatorTree *DT) {
7731   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7732                     << '\n');
7733 
7734   // Perform the actual loop transformation.
7735 
7736   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7737   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7738   Value *CanonicalIVStartValue;
7739   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7740       ILV.createVectorizedLoopSkeleton();
7741   ILV.collectPoisonGeneratingRecipes(State);
7742 
7743   ILV.printDebugTracesAtStart();
7744 
7745   //===------------------------------------------------===//
7746   //
  // Notice: any optimization or new instruction that goes
7748   // into the code below should also be implemented in
7749   // the cost-model.
7750   //
7751   //===------------------------------------------------===//
7752 
7753   // 2. Copy and widen instructions from the old loop into the new loop.
7754   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7755                              ILV.getOrCreateVectorTripCount(nullptr),
7756                              CanonicalIVStartValue, State);
7757   BestVPlan.execute(&State);
7758 
  // Fetch the original loop ID so that follow-up metadata, or failing that the
  // original loop hints, can be attached to the vector loop below.
7761   MDNode *OrigLoopID = OrigLoop->getLoopID();
7762 
7763   Optional<MDNode *> VectorizedLoopID =
7764       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7765                                       LLVMLoopVectorizeFollowupVectorized});
7766 
7767   Loop *L = LI->getLoopFor(State.CFG.PrevBB);
7768   if (VectorizedLoopID.hasValue())
7769     L->setLoopID(VectorizedLoopID.getValue());
7770   else {
7771     // Keep all loop hints from the original loop on the vector loop (we'll
7772     // replace the vectorizer-specific hints below).
7773     if (MDNode *LID = OrigLoop->getLoopID())
7774       L->setLoopID(LID);
7775 
7776     LoopVectorizeHints Hints(L, true, *ORE);
7777     Hints.setAlreadyVectorized();
7778   }
7779   // Disable runtime unrolling when vectorizing the epilogue loop.
7780   if (CanonicalIVStartValue)
7781     AddRuntimeUnrollDisableMetaData(L);
7782 
7783   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7784   //    predication, updating analyses.
7785   ILV.fixVectorizedLoop(State);
7786 
7787   ILV.printDebugTracesAtEnd();
7788 }
7789 
7790 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7791 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7792   for (const auto &Plan : VPlans)
7793     if (PrintVPlansInDotFormat)
7794       Plan->printDOT(O);
7795     else
7796       Plan->print(O);
7797 }
7798 #endif
7799 
7800 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7801     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7802 
  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by the
  // terminator.
7806   SmallVector<BasicBlock*> ExitingBlocks;
7807   OrigLoop->getExitingBlocks(ExitingBlocks);
7808   for (auto *BB : ExitingBlocks) {
7809     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7810     if (!Cmp || !Cmp->hasOneUse())
7811       continue;
7812 
7813     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7814     if (!DeadInstructions.insert(Cmp).second)
7815       continue;
7816 
    // The operands of the icmp are often dead truncs, used by IndUpdate.
7818     // TODO: can recurse through operands in general
7819     for (Value *Op : Cmp->operands()) {
7820       if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
7822     }
7823   }
7824 
7825   // We create new "steps" for induction variable updates to which the original
7826   // induction variables map. An original update instruction will be dead if
7827   // all its users except the induction variable are dead.
7828   auto *Latch = OrigLoop->getLoopLatch();
7829   for (auto &Induction : Legal->getInductionVars()) {
7830     PHINode *Ind = Induction.first;
7831     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7832 
    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
7835     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7836       continue;
7837 
7838     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7839           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7840         }))
7841       DeadInstructions.insert(IndUpdate);
7842   }
7843 }
7844 
7845 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7846 
7847 //===--------------------------------------------------------------------===//
7848 // EpilogueVectorizerMainLoop
7849 //===--------------------------------------------------------------------===//
7850 
7851 /// This function is partially responsible for generating the control flow
7852 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7853 std::pair<BasicBlock *, Value *>
7854 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7855   MDNode *OrigLoopID = OrigLoop->getLoopID();
7856   Loop *Lp = createVectorLoopSkeleton("");
7857 
7858   // Generate the code to check the minimum iteration count of the vector
7859   // epilogue (see below).
7860   EPI.EpilogueIterationCountCheck =
7861       emitMinimumIterationCountCheck(LoopScalarPreHeader, true);
7862   EPI.EpilogueIterationCountCheck->setName("iter.check");
7863 
7864   // Generate the code to check any assumptions that we've made for SCEV
7865   // expressions.
7866   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7867 
7868   // Generate the code that checks at runtime if arrays overlap. We put the
7869   // checks into a separate block to make the more common case of few elements
7870   // faster.
7871   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7872 
7873   // Generate the iteration count check for the main loop, *after* the check
7874   // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer path length for
  // the main loop is compensated for by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
7878   // the epilogue.
7879   EPI.MainLoopIterationCountCheck =
7880       emitMinimumIterationCountCheck(LoopScalarPreHeader, false);
7881 
7882   // Generate the induction variable.
7883   Value *CountRoundDown = getOrCreateVectorTripCount(LoopVectorPreHeader);
7884   EPI.VectorTripCount = CountRoundDown;
7885   createHeaderBranch(Lp);
7886 
7887   // Skip induction resume value creation here because they will be created in
7888   // the second pass. If we created them here, they wouldn't be used anyway,
  // because the VPlan in the second pass still contains the inductions from the
7890   // original loop.
7891 
7892   return {completeLoopSkeleton(OrigLoopID), nullptr};
7893 }
7894 
7895 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7896   LLVM_DEBUG({
7897     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7898            << "Main Loop VF:" << EPI.MainLoopVF
7899            << ", Main Loop UF:" << EPI.MainLoopUF
7900            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7901            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7902   });
7903 }
7904 
7905 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7906   DEBUG_WITH_TYPE(VerboseDebug, {
7907     dbgs() << "intermediate fn:\n"
7908            << *OrigLoop->getHeader()->getParent() << "\n";
7909   });
7910 }
7911 
7912 BasicBlock *
7913 EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(BasicBlock *Bypass,
7914                                                            bool ForEpilogue) {
7915   assert(Bypass && "Expected valid bypass basic block.");
7916   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7917   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7918   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
  // Reuse the existing vector loop preheader for the TC checks.
  // Note that a new preheader block is generated for the vector loop.
7921   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7922   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7923 
7924   // Generate code to check if the loop's trip count is less than VF * UF of the
7925   // main vector loop.
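  // If a scalar epilogue is required, at least one iteration must remain for
  // it, so the vector loop is bypassed when the count is <= VF * UF (ULE);
  // otherwise a strict less-than check (ULT) suffices.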
7926   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
7927       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7928 
7929   Value *CheckMinIters = Builder.CreateICmp(
7930       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7931       "min.iters.check");
7932 
7933   if (!ForEpilogue)
7934     TCCheckBlock->setName("vector.main.loop.iter.check");
7935 
7936   // Create new preheader for vector loop.
7937   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7938                                    DT, LI, nullptr, "vector.ph");
7939 
7940   if (ForEpilogue) {
7941     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7942                                  DT->getNode(Bypass)->getIDom()) &&
7943            "TC check is expected to dominate Bypass");
7944 
7945     // Update dominator for Bypass & LoopExit.
7946     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7947     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7948       // For loops with multiple exits, there's no edge from the middle block
7949       // to exit blocks (as the epilogue must run) and thus no need to update
7950       // the immediate dominator of the exit blocks.
7951       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7952 
7953     LoopBypassBlocks.push_back(TCCheckBlock);
7954 
7955     // Save the trip count so we don't have to regenerate it in the
7956     // vec.epilog.iter.check. This is safe to do because the trip count
7957     // generated here dominates the vector epilog iter check.
7958     EPI.TripCount = Count;
7959   }
7960 
7961   ReplaceInstWithInst(
7962       TCCheckBlock->getTerminator(),
7963       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7964 
7965   return TCCheckBlock;
7966 }
7967 
7968 //===--------------------------------------------------------------------===//
7969 // EpilogueVectorizerEpilogueLoop
7970 //===--------------------------------------------------------------------===//
7971 
7972 /// This function is partially responsible for generating the control flow
7973 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7974 std::pair<BasicBlock *, Value *>
7975 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7976   MDNode *OrigLoopID = OrigLoop->getLoopID();
7977   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
7978 
  // Now, compare the remaining iteration count; if there aren't enough
  // iterations left to execute the vectorized epilogue, skip to the scalar
  // part.
7981   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7982   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7983   LoopVectorPreHeader =
7984       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7985                  LI, nullptr, "vec.epilog.ph");
7986   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7987                                           VecEpilogueIterationCountCheck);
7988 
7989   // Adjust the control flow taking the state info from the main loop
7990   // vectorization into account.
7991   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7992          "expected this to be saved from the previous pass.");
7993   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7994       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7995 
7996   DT->changeImmediateDominator(LoopVectorPreHeader,
7997                                EPI.MainLoopIterationCountCheck);
7998 
7999   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8000       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8001 
8002   if (EPI.SCEVSafetyCheck)
8003     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8004         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8005   if (EPI.MemSafetyCheck)
8006     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8007         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8008 
8009   DT->changeImmediateDominator(
8010       VecEpilogueIterationCountCheck,
8011       VecEpilogueIterationCountCheck->getSinglePredecessor());
8012 
8013   DT->changeImmediateDominator(LoopScalarPreHeader,
8014                                EPI.EpilogueIterationCountCheck);
8015   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8016     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
8018     // dominator of the exit blocks.
8019     DT->changeImmediateDominator(LoopExitBlock,
8020                                  EPI.EpilogueIterationCountCheck);
8021 
8022   // Keep track of bypass blocks, as they feed start values to the induction
8023   // phis in the scalar loop preheader.
8024   if (EPI.SCEVSafetyCheck)
8025     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8026   if (EPI.MemSafetyCheck)
8027     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8028   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8029 
8030   // The vec.epilog.iter.check block may contain Phi nodes from reductions which
8031   // merge control-flow from the latch block and the middle block. Update the
8032   // incoming values here and move the Phi into the preheader.
8033   SmallVector<PHINode *, 4> PhisInBlock;
8034   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
8035     PhisInBlock.push_back(&Phi);
8036 
8037   for (PHINode *Phi : PhisInBlock) {
8038     Phi->replaceIncomingBlockWith(
8039         VecEpilogueIterationCountCheck->getSinglePredecessor(),
8040         VecEpilogueIterationCountCheck);
8041     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
8042     if (EPI.SCEVSafetyCheck)
8043       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
8044     if (EPI.MemSafetyCheck)
8045       Phi->removeIncomingValue(EPI.MemSafetyCheck);
8046     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
8047   }
8048 
  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
8051   Type *IdxTy = Legal->getWidestInductionType();
8052   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8053                                          LoopVectorPreHeader->getFirstNonPHI());
8054   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8055   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8056                            EPI.MainLoopIterationCountCheck);
8057 
8058   // Generate the induction variable.
8059   createHeaderBranch(Lp);
8060 
8061   // Generate induction resume values. These variables save the new starting
8062   // indexes for the scalar loop. They are used to test if there are any tail
8063   // iterations left once the vector loop has completed.
8064   // Note that when the vectorized epilogue is skipped due to iteration count
8065   // check, then the resume value for the induction variable comes from
8066   // the trip count of the main vector loop, hence passing the AdditionalBypass
8067   // argument.
8068   createInductionResumeValues({VecEpilogueIterationCountCheck,
8069                                EPI.VectorTripCount} /* AdditionalBypass */);
8070 
8071   return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
8072 }
8073 
8074 BasicBlock *
8075 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8076     BasicBlock *Bypass, BasicBlock *Insert) {
8077 
8078   assert(EPI.TripCount &&
8079          "Expected trip count to have been safed in the first pass.");
8080   assert(
8081       (!isa<Instruction>(EPI.TripCount) ||
8082        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8083       "saved trip count does not dominate insertion point.");
8084   Value *TC = EPI.TripCount;
8085   IRBuilder<> Builder(Insert->getTerminator());
8086   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8087 
8088   // Generate code to check if the loop's trip count is less than VF * UF of the
8089   // vector epilogue loop.
8090   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8091       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8092 
8093   Value *CheckMinIters =
8094       Builder.CreateICmp(P, Count,
8095                          createStepForVF(Builder, Count->getType(),
8096                                          EPI.EpilogueVF, EPI.EpilogueUF),
8097                          "min.epilog.iters.check");
8098 
8099   ReplaceInstWithInst(
8100       Insert->getTerminator(),
8101       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8102 
8103   LoopBypassBlocks.push_back(Insert);
8104   return Insert;
8105 }
8106 
8107 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8108   LLVM_DEBUG({
8109     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8110            << "Epilogue Loop VF:" << EPI.EpilogueVF
8111            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8112   });
8113 }
8114 
8115 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8116   DEBUG_WITH_TYPE(VerboseDebug, {
8117     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8118   });
8119 }
8120 
8121 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8122     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8123   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
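  // For example, if Predicate holds for VF = 4 and 8 but not for 16, and Range
  // is [4, 32), the range is clamped to [4, 16) and the predicate's value at
  // Range.Start (true) is returned.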
8124   bool PredicateAtRangeStart = Predicate(Range.Start);
8125 
8126   for (ElementCount TmpVF = Range.Start * 2;
8127        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8128     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8129       Range.End = TmpVF;
8130       break;
8131     }
8132 
8133   return PredicateAtRangeStart;
8134 }
8135 
8136 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8137 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8138 /// of VF's starting at a given VF and extending it as much as possible. Each
8139 /// vectorization decision can potentially shorten this sub-range during
8140 /// buildVPlan().
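/// For example, with MinVF = 1 and MaxVF = 16 this might build three VPlans
/// covering the sub-ranges [1, 2), [2, 8) and [8, 17), depending on where the
/// vectorization decisions change.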
8141 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8142                                            ElementCount MaxVF) {
8143   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8144   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8145     VFRange SubRange = {VF, MaxVFPlusOne};
8146     VPlans.push_back(buildVPlan(SubRange));
8147     VF = SubRange.End;
8148   }
8149 }
8150 
8151 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8152                                          VPlanPtr &Plan) {
8153   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8154 
8155   // Look for cached value.
8156   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8157   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8158   if (ECEntryIt != EdgeMaskCache.end())
8159     return ECEntryIt->second;
8160 
8161   VPValue *SrcMask = createBlockInMask(Src, Plan);
8162 
8163   // The terminator has to be a branch inst!
8164   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8165   assert(BI && "Unexpected terminator found");
8166 
8167   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8168     return EdgeMaskCache[Edge] = SrcMask;
8169 
8170   // If source is an exiting block, we know the exit edge is dynamically dead
8171   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8172   // adding uses of an otherwise potentially dead instruction.
8173   if (OrigLoop->isLoopExiting(Src))
8174     return EdgeMaskCache[Edge] = SrcMask;
8175 
8176   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8177   assert(EdgeMask && "No Edge Mask found for condition");
8178 
8179   if (BI->getSuccessor(0) != Dst)
8180     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8181 
8182   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8183     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8184     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8185     // The select version does not introduce new UB if SrcMask is false and
8186     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8187     VPValue *False = Plan->getOrAddVPValue(
8188         ConstantInt::getFalse(BI->getCondition()->getType()));
8189     EdgeMask =
8190         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8191   }
8192 
8193   return EdgeMaskCache[Edge] = EdgeMask;
8194 }
8195 
8196 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8197   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8198 
8199   // Look for cached value.
8200   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8201   if (BCEntryIt != BlockMaskCache.end())
8202     return BCEntryIt->second;
8203 
  // An all-one mask is modelled as no-mask, following the convention for
  // masked load/store/gather/scatter. Initialize BlockMask to no-mask.
8206   VPValue *BlockMask = nullptr;
8207 
8208   if (OrigLoop->getHeader() == BB) {
8209     if (!CM.blockNeedsPredicationForAnyReason(BB))
8210       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8211 
8212     // Introduce the early-exit compare IV <= BTC to form header block mask.
8213     // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
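    // For example, with an i8 induction and a trip count of 256, TC wraps to 0
    // while BTC remains 255, so IV <= BTC covers all iterations correctly.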
8214     // constructing the desired canonical IV in the header block as its first
8215     // non-phi instructions.
8216     assert(CM.foldTailByMasking() && "must fold the tail");
8217     VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
8218     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8219     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8220     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8221 
8222     VPBuilder::InsertPointGuard Guard(Builder);
8223     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8224     if (CM.TTI.emitGetActiveLaneMask()) {
8225       VPValue *TC = Plan->getOrCreateTripCount();
8226       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
8227     } else {
8228       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8229       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8230     }
8231     return BlockMaskCache[BB] = BlockMask;
8232   }
8233 
8234   // This is the block mask. We OR all incoming edges.
8235   for (auto *Predecessor : predecessors(BB)) {
8236     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8237     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8238       return BlockMaskCache[BB] = EdgeMask;
8239 
    if (!BlockMask) { // BlockMask still has its initial nullptr value.
8241       BlockMask = EdgeMask;
8242       continue;
8243     }
8244 
8245     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8246   }
8247 
8248   return BlockMaskCache[BB] = BlockMask;
8249 }
8250 
8251 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8252                                                 ArrayRef<VPValue *> Operands,
8253                                                 VFRange &Range,
8254                                                 VPlanPtr &Plan) {
8255   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8256          "Must be called with either a load or store");
8257 
8258   auto willWiden = [&](ElementCount VF) -> bool {
8259     if (VF.isScalar())
8260       return false;
8261     LoopVectorizationCostModel::InstWidening Decision =
8262         CM.getWideningDecision(I, VF);
8263     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8264            "CM decision should be taken at this point.");
8265     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8266       return true;
8267     if (CM.isScalarAfterVectorization(I, VF) ||
8268         CM.isProfitableToScalarize(I, VF))
8269       return false;
8270     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8271   };
8272 
8273   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8274     return nullptr;
8275 
8276   VPValue *Mask = nullptr;
8277   if (Legal->isMaskRequired(I))
8278     Mask = createBlockInMask(I->getParent(), Plan);
8279 
8280   // Determine if the pointer operand of the access is either consecutive or
8281   // reverse consecutive.
8282   LoopVectorizationCostModel::InstWidening Decision =
8283       CM.getWideningDecision(I, Range.Start);
8284   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8285   bool Consecutive =
8286       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8287 
8288   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8289     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8290                                               Consecutive, Reverse);
8291 
8292   StoreInst *Store = cast<StoreInst>(I);
8293   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8294                                             Mask, Consecutive, Reverse);
8295 }
8296 
8297 static VPWidenIntOrFpInductionRecipe *
8298 createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
8299                            VPValue *Start, const InductionDescriptor &IndDesc,
8300                            LoopVectorizationCostModel &CM, ScalarEvolution &SE,
8301                            Loop &OrigLoop, VFRange &Range) {
8302   // Returns true if an instruction \p I should be scalarized instead of
8303   // vectorized for the chosen vectorization factor.
8304   auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8305     return CM.isScalarAfterVectorization(I, VF) ||
8306            CM.isProfitableToScalarize(I, VF);
8307   };
8308 
8309   bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange(
8310       [&](ElementCount VF) {
        // Returns true if we should generate a scalar version of
        // \p PhiOrTrunc.
8312         if (ShouldScalarizeInstruction(PhiOrTrunc, VF))
8313           return true;
8314         auto isScalarInst = [&](User *U) -> bool {
8315           auto *I = cast<Instruction>(U);
8316           return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF);
8317         };
8318         return any_of(PhiOrTrunc->users(), isScalarInst);
8319       },
8320       Range);
8321   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8322       [&](ElementCount VF) {
8323         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8324       },
8325       Range);
8326   assert(IndDesc.getStartValue() ==
8327          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8328   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8329          "step must be loop invariant");
8330   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8331     return new VPWidenIntOrFpInductionRecipe(
8332         Phi, Start, IndDesc, TruncI, NeedsScalarIV, !NeedsScalarIVOnly, SE);
8333   }
8334   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8335   return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV,
8336                                            !NeedsScalarIVOnly, SE);
8337 }
8338 
8339 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8340     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const {
8341 
8342   // Check if this is an integer or fp induction. If so, build the recipe that
8343   // produces its scalar and vector values.
8344   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8345     return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM,
8346                                       *PSE.getSE(), *OrigLoop, Range);
8347 
8348   return nullptr;
8349 }
8350 
8351 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8352     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8353     VPlan &Plan) const {
8354   // Optimize the special case where the source is a constant integer
8355   // induction variable. Notice that we can only optimize the 'trunc' case
8356   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8357   // (c) other casts depend on pointer size.
8358 
8359   // Determine whether \p K is a truncation based on an induction variable that
8360   // can be optimized.
8361   auto isOptimizableIVTruncate =
8362       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8363     return [=](ElementCount VF) -> bool {
8364       return CM.isOptimizableIVTruncate(K, VF);
8365     };
8366   };
8367 
8368   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8369           isOptimizableIVTruncate(I), Range)) {
8370 
8371     auto *Phi = cast<PHINode>(I->getOperand(0));
8372     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8373     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8374     return createWidenInductionRecipe(Phi, I, Start, II, CM, *PSE.getSE(),
8375                                       *OrigLoop, Range);
8376   }
8377   return nullptr;
8378 }
8379 
8380 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8381                                                 ArrayRef<VPValue *> Operands,
8382                                                 VPlanPtr &Plan) {
8383   // If all incoming values are equal, the incoming VPValue can be used directly
8384   // instead of creating a new VPBlendRecipe.
8385   VPValue *FirstIncoming = Operands[0];
8386   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8387         return FirstIncoming == Inc;
8388       })) {
8389     return Operands[0];
8390   }
8391 
8392   unsigned NumIncoming = Phi->getNumIncomingValues();
8393   // For in-loop reductions, we do not need to create an additional select.
8394   VPValue *InLoopVal = nullptr;
8395   for (unsigned In = 0; In < NumIncoming; In++) {
8396     PHINode *PhiOp =
8397         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8398     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8399       assert(!InLoopVal && "Found more than one in-loop reduction!");
8400       InLoopVal = Operands[In];
8401     }
8402   }
8403 
8404   assert((!InLoopVal || NumIncoming == 2) &&
8405          "Found an in-loop reduction for PHI with unexpected number of "
8406          "incoming values");
8407   if (InLoopVal)
8408     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8409 
8410   // We know that all PHIs in non-header blocks are converted into selects, so
8411   // we don't have to worry about the insertion order and we can just use the
8412   // builder. At this point we generate the predication tree. There may be
8413   // duplications since this is a simple recursive scan, but future
8414   // optimizations will clean it up.
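       // Note that the resulting blend operands are laid out as pairs of
       // (incoming value, edge mask); the mask is only omitted when there is a
       // single incoming value.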
8415   SmallVector<VPValue *, 2> OperandsWithMask;
8416 
8417   for (unsigned In = 0; In < NumIncoming; In++) {
8418     VPValue *EdgeMask =
8419       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8420     assert((EdgeMask || NumIncoming == 1) &&
8421            "Multiple predecessors with one having a full mask");
8422     OperandsWithMask.push_back(Operands[In]);
8423     if (EdgeMask)
8424       OperandsWithMask.push_back(EdgeMask);
8425   }
8426   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8427 }
8428 
8429 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8430                                                    ArrayRef<VPValue *> Operands,
8431                                                    VFRange &Range) const {
8432 
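       // If the call would have to be scalarized and predicated for this VF
       // range, do not widen it here; returning nullptr lets it be handled by
       // replication instead.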
8433   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8434       [this, CI](ElementCount VF) {
8435         return CM.isScalarWithPredication(CI, VF);
8436       },
8437       Range);
8438 
8439   if (IsPredicated)
8440     return nullptr;
8441 
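       // Never turn these intrinsics into wide vector calls; returning nullptr
       // means they will be handled by scalarization/replication instead.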
8442   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8443   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8444              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8445              ID == Intrinsic::pseudoprobe ||
8446              ID == Intrinsic::experimental_noalias_scope_decl))
8447     return nullptr;
8448 
8449   auto willWiden = [&](ElementCount VF) -> bool {
8450     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8451     // The following case may be scalarized depending on the VF.
8452     // The flag indicates whether we use an intrinsic or a regular call for
8453     // the vectorized version of the instruction.
8454     // Is it beneficial to perform the intrinsic call compared to a lib call?
8455     bool NeedToScalarize = false;
8456     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8457     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8458     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8459     return UseVectorIntrinsic || !NeedToScalarize;
8460   };
8461 
8462   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8463     return nullptr;
8464 
8465   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8466   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8467 }
8468 
8469 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8470   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8471          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8472   // The instruction should be widened, unless it is scalar after
8473   // vectorization, scalarization is profitable, or it is predicated.
8474   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8475     return CM.isScalarAfterVectorization(I, VF) ||
8476            CM.isProfitableToScalarize(I, VF) ||
8477            CM.isScalarWithPredication(I, VF);
8478   };
8479   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8480                                                              Range);
8481 }
8482 
8483 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8484                                            ArrayRef<VPValue *> Operands) const {
8485   auto IsVectorizableOpcode = [](unsigned Opcode) {
8486     switch (Opcode) {
8487     case Instruction::Add:
8488     case Instruction::And:
8489     case Instruction::AShr:
8490     case Instruction::BitCast:
8491     case Instruction::FAdd:
8492     case Instruction::FCmp:
8493     case Instruction::FDiv:
8494     case Instruction::FMul:
8495     case Instruction::FNeg:
8496     case Instruction::FPExt:
8497     case Instruction::FPToSI:
8498     case Instruction::FPToUI:
8499     case Instruction::FPTrunc:
8500     case Instruction::FRem:
8501     case Instruction::FSub:
8502     case Instruction::ICmp:
8503     case Instruction::IntToPtr:
8504     case Instruction::LShr:
8505     case Instruction::Mul:
8506     case Instruction::Or:
8507     case Instruction::PtrToInt:
8508     case Instruction::SDiv:
8509     case Instruction::Select:
8510     case Instruction::SExt:
8511     case Instruction::Shl:
8512     case Instruction::SIToFP:
8513     case Instruction::SRem:
8514     case Instruction::Sub:
8515     case Instruction::Trunc:
8516     case Instruction::UDiv:
8517     case Instruction::UIToFP:
8518     case Instruction::URem:
8519     case Instruction::Xor:
8520     case Instruction::ZExt:
8521       return true;
8522     }
8523     return false;
8524   };
8525 
8526   if (!IsVectorizableOpcode(I->getOpcode()))
8527     return nullptr;
8528 
8529   // Success: widen this instruction.
8530   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8531 }
8532 
8533 void VPRecipeBuilder::fixHeaderPhis() {
8534   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8535   for (VPHeaderPHIRecipe *R : PhisToFix) {
8536     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8537     VPRecipeBase *IncR =
8538         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8539     R->addOperand(IncR->getVPSingleValue());
8540   }
8541 }
8542 
8543 VPBasicBlock *VPRecipeBuilder::handleReplication(
8544     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8545     VPlanPtr &Plan) {
8546   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8547       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8548       Range);
8549 
8550   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8551       [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
8552       Range);
8553 
8554   // Even if the instruction is not marked as uniform, there are certain
8555   // intrinsic calls that can be effectively treated as such, so we check for
8556   // them here. Conservatively, we only do this for scalable vectors, since
8557   // for fixed-width VFs we can always fall back on full scalarization.
8558   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8559     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8560     case Intrinsic::assume:
8561     case Intrinsic::lifetime_start:
8562     case Intrinsic::lifetime_end:
8563       // For scalable vectors, if one of the operands is variant then we
8564       // still want to mark the instruction as uniform, which will generate
8565       // one instruction for just the first lane of the vector. We can't
8566       // scalarize the call in the same way as for fixed-width vectors
8567       // because we don't know how many lanes there are.
8568       //
8569       // The reasons for doing it this way for scalable vectors are:
8570       //   1. For the assume intrinsic, generating the instruction for the
8571       //      first lane is still better than not generating any at all. For
8572       //      example, the input may be a splat across all lanes.
8573       //   2. For the lifetime start/end intrinsics the pointer operand only
8574       //      does anything useful when the input comes from a stack object,
8575       //      which suggests it should always be uniform. For non-stack objects
8576       //      the effect is to poison the object, which still allows us to
8577       //      remove the call.
8578       IsUniform = true;
8579       break;
8580     default:
8581       break;
8582     }
8583   }
8584 
8585   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8586                                        IsUniform, IsPredicated);
8587   setRecipe(I, Recipe);
8588   Plan->addVPValue(I, Recipe);
8589 
8590   // Find if I uses a predicated instruction. If so, it will use its scalar
8591   // value. Avoid hoisting the insert-element which packs the scalar value into
8592   // a vector value, as that happens iff all users use the vector value.
8593   for (VPValue *Op : Recipe->operands()) {
8594     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8595     if (!PredR)
8596       continue;
8597     auto *RepR =
8598         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8599     assert(RepR->isPredicated() &&
8600            "expected Replicate recipe to be predicated");
8601     RepR->setAlsoPack(false);
8602   }
8603 
8604   // Finalize the recipe for Instr, handling the non-predicated case first.
8605   if (!IsPredicated) {
8606     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8607     VPBB->appendRecipe(Recipe);
8608     return VPBB;
8609   }
8610   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8611 
8612   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8613   assert(SingleSucc && "VPBB must have a single successor when handling "
8614                        "predicated replication.");
8615   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8616   // Record predicated instructions for above packing optimizations.
8617   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8618   VPBlockUtils::insertBlockAfter(Region, VPBB);
8619   auto *RegSucc = new VPBasicBlock();
8620   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8621   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8622   return RegSucc;
8623 }
8624 
8625 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8626                                                       VPRecipeBase *PredRecipe,
8627                                                       VPlanPtr &Plan) {
8628   // Instructions marked for predication are replicated and placed under an
8629   // if-then construct to prevent side-effects.
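       // The resulting region has a triangular shape (sketch):
       //
       //    pred.<opcode>.entry      (BranchOnMask on the block-in mask)
       //      |         \
       //      |     pred.<opcode>.if (the predicated recipe)
       //      |         /
       //    pred.<opcode>.continue   (phi of the result, if any)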
8630 
8631   // Generate recipes to compute the block mask for this region.
8632   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8633 
8634   // Build the triangular if-then region.
8635   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8636   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8637   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8638   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8639   auto *PHIRecipe = Instr->getType()->isVoidTy()
8640                         ? nullptr
8641                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8642   if (PHIRecipe) {
8643     Plan->removeVPValueFor(Instr);
8644     Plan->addVPValue(Instr, PHIRecipe);
8645   }
8646   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8647   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8648   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8649 
8650   // Note: first set Entry as region entry and then connect successors starting
8651   // from it in order, to propagate the "parent" of each VPBasicBlock.
8652   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8653   VPBlockUtils::connectBlocks(Pred, Exit);
8654 
8655   return Region;
8656 }
8657 
8658 VPRecipeOrVPValueTy
8659 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8660                                         ArrayRef<VPValue *> Operands,
8661                                         VFRange &Range, VPlanPtr &Plan) {
8662   // First, check for specific widening recipes that deal with calls, memory
8663   // operations, inductions and Phi nodes.
8664   if (auto *CI = dyn_cast<CallInst>(Instr))
8665     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8666 
8667   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8668     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8669 
8670   VPRecipeBase *Recipe;
8671   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8672     if (Phi->getParent() != OrigLoop->getHeader())
8673       return tryToBlend(Phi, Operands, Plan);
8674     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8675       return toVPRecipeResult(Recipe);
8676 
8677     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8678     if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
8679       VPValue *StartV = Operands[0];
8680       if (Legal->isReductionVariable(Phi)) {
8681         const RecurrenceDescriptor &RdxDesc =
8682             Legal->getReductionVars().find(Phi)->second;
8683         assert(RdxDesc.getRecurrenceStartValue() ==
8684                Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8685         PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8686                                              CM.isInLoopReduction(Phi),
8687                                              CM.useOrderedReductions(RdxDesc));
8688       } else {
8689         PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8690       }
8691 
8692       // Record the incoming value from the backedge, so we can add the incoming
8693       // value from the backedge after all recipes have been created.
8694       recordRecipeOf(cast<Instruction>(
8695           Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8696       PhisToFix.push_back(PhiRecipe);
8697     } else {
8698       // TODO: record backedge value for remaining pointer induction phis.
8699       assert(Phi->getType()->isPointerTy() &&
8700              "only pointer phis should be handled here");
8701       assert(Legal->getInductionVars().count(Phi) &&
8702              "Not an induction variable");
8703       InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8704       VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
8705       PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
8706     }
8707 
8708     return toVPRecipeResult(PhiRecipe);
8709   }
8710 
8711   if (isa<TruncInst>(Instr) &&
8712       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8713                                                Range, *Plan)))
8714     return toVPRecipeResult(Recipe);
8715 
8716   if (!shouldWiden(Instr, Range))
8717     return nullptr;
8718 
8719   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8720     return toVPRecipeResult(new VPWidenGEPRecipe(
8721         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8722 
8723   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8724     bool InvariantCond =
8725         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8726     return toVPRecipeResult(new VPWidenSelectRecipe(
8727         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8728   }
8729 
8730   return toVPRecipeResult(tryToWiden(Instr, Operands));
8731 }
8732 
8733 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8734                                                         ElementCount MaxVF) {
8735   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8736 
8737   // Collect instructions from the original loop that will become trivially dead
8738   // in the vectorized loop. We don't need to vectorize these instructions. For
8739   // example, original induction update instructions can become dead because we
8740   // separately emit induction "steps" when generating code for the new loop.
8741   // Similarly, we create a new latch condition when setting up the structure
8742   // of the new loop, so the old one can become dead.
8743   SmallPtrSet<Instruction *, 4> DeadInstructions;
8744   collectTriviallyDeadInstructions(DeadInstructions);
8745 
8746   // Add assume instructions we need to drop to DeadInstructions, to prevent
8747   // them from being added to the VPlan.
8748   // TODO: We only need to drop assumes in blocks that get flattened. If the
8749   // control flow is preserved, we should keep them.
8750   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8751   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8752 
8753   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8754   // Dead instructions do not need sinking. Remove them from SinkAfter.
8755   for (Instruction *I : DeadInstructions)
8756     SinkAfter.erase(I);
8757 
8758   // Cannot sink instructions after dead instructions (there won't be any
8759   // recipes for them). Instead, find the first non-dead previous instruction.
8760   for (auto &P : Legal->getSinkAfter()) {
8761     Instruction *SinkTarget = P.second;
8762     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8763     (void)FirstInst;
8764     while (DeadInstructions.contains(SinkTarget)) {
8765       assert(
8766           SinkTarget != FirstInst &&
8767           "Must find a live instruction (at least the one feeding the "
8768           "first-order recurrence PHI) before reaching beginning of the block");
8769       SinkTarget = SinkTarget->getPrevNode();
8770       assert(SinkTarget != P.first &&
8771              "sink source equals target, no sinking required");
8772     }
8773     P.second = SinkTarget;
8774   }
8775 
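       // Build a VPlan for each contiguous sub-range of VFs sharing the same
       // widening decisions; each call below clamps SubRange.End to the first
       // VF that requires a different plan.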
8776   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8777   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8778     VFRange SubRange = {VF, MaxVFPlusOne};
8779     VPlans.push_back(
8780         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8781     VF = SubRange.End;
8782   }
8783 }
8784 
8785 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
8786 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
8787 // BranchOnCount VPInstruction to the latch.
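     // A rough sketch of the result, in VPlan-printing style (names are
     // illustrative only):
     //   vector loop header:
     //     EMIT vp<%can.iv> = CANONICAL-INDUCTION
     //   vector loop latch:
     //     EMIT vp<%can.iv.next> = VF * UF + vp<%can.iv>
     //     EMIT branch-on-count vp<%can.iv.next>, vector-trip-count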
8788 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8789                                   bool HasNUW, bool IsVPlanNative) {
8790   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8791   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8792 
8793   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8794   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8795   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8796   if (IsVPlanNative)
8797     Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
8798   Header->insert(CanonicalIVPHI, Header->begin());
8799 
8800   auto *CanonicalIVIncrement =
8801       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8802                                : VPInstruction::CanonicalIVIncrement,
8803                         {CanonicalIVPHI}, DL);
8804   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8805 
8806   VPBasicBlock *EB = TopRegion->getExitBasicBlock();
8807   if (IsVPlanNative) {
8808     EB = cast<VPBasicBlock>(EB->getSinglePredecessor());
8809     EB->setCondBit(nullptr);
8810   }
8811   EB->appendRecipe(CanonicalIVIncrement);
8812 
8813   auto *BranchOnCount =
8814       new VPInstruction(VPInstruction::BranchOnCount,
8815                         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8816   EB->appendRecipe(BranchOnCount);
8817 }
8818 
8819 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8820     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8821     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8822 
8823   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8824 
8825   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8826 
8827   // ---------------------------------------------------------------------------
8828   // Pre-construction: record ingredients whose recipes we'll need to further
8829   // process after constructing the initial VPlan.
8830   // ---------------------------------------------------------------------------
8831 
8832   // Mark instructions we'll need to sink later and their targets as
8833   // ingredients whose recipe we'll need to record.
8834   for (auto &Entry : SinkAfter) {
8835     RecipeBuilder.recordRecipeOf(Entry.first);
8836     RecipeBuilder.recordRecipeOf(Entry.second);
8837   }
8838   for (auto &Reduction : CM.getInLoopReductionChains()) {
8839     PHINode *Phi = Reduction.first;
8840     RecurKind Kind =
8841         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8842     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8843 
8844     RecipeBuilder.recordRecipeOf(Phi);
8845     for (auto &R : ReductionOperations) {
8846       RecipeBuilder.recordRecipeOf(R);
8847       // For min/max reductions, where we have a pair of icmp/select, we also
8848       // need to record the ICmp recipe, so it can be removed later.
8849       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8850              "Only min/max recurrences allowed for inloop reductions");
8851       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8852         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8853     }
8854   }
8855 
8856   // For each interleave group which is relevant for this (possibly trimmed)
8857   // Range, add it to the set of groups to be later applied to the VPlan and add
8858   // placeholders for its members' Recipes which we'll be replacing with a
8859   // single VPInterleaveRecipe.
8860   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8861     auto applyIG = [IG, this](ElementCount VF) -> bool {
8862       return (VF.isVector() && // Query is illegal for VF == 1
8863               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8864                   LoopVectorizationCostModel::CM_Interleave);
8865     };
8866     if (!getDecisionAndClampRange(applyIG, Range))
8867       continue;
8868     InterleaveGroups.insert(IG);
8869     for (unsigned i = 0; i < IG->getFactor(); i++)
8870       if (Instruction *Member = IG->getMember(i))
8871         RecipeBuilder.recordRecipeOf(Member);
8872   }
8873 
8874   // ---------------------------------------------------------------------------
8875   // Build initial VPlan: Scan the body of the loop in a topological order to
8876   // visit each basic block after having visited its predecessor basic blocks.
8877   // ---------------------------------------------------------------------------
8878 
8879   // Create initial VPlan skeleton, with separate header and latch blocks.
8880   VPBasicBlock *HeaderVPBB = new VPBasicBlock();
8881   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8882   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8883   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8884   auto Plan = std::make_unique<VPlan>(TopRegion);
8885 
8886   Instruction *DLInst =
8887       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8888   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8889                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8890                         !CM.foldTailByMasking(), false);
8891 
8892   // Scan the body of the loop in a topological order to visit each basic block
8893   // after having visited its predecessor basic blocks.
8894   LoopBlocksDFS DFS(OrigLoop);
8895   DFS.perform(LI);
8896 
8897   VPBasicBlock *VPBB = HeaderVPBB;
8898   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8899   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8900     // Relevant instructions from basic block BB will be grouped into VPRecipe
8901     // ingredients and fill a new VPBasicBlock.
8902     unsigned VPBBsForBB = 0;
8903     VPBB->setName(BB->getName());
8904     Builder.setInsertPoint(VPBB);
8905 
8906     // Introduce each ingredient into VPlan.
8907     // TODO: Model and preserve debug intrinsics in VPlan.
8908     for (Instruction &I : BB->instructionsWithoutDebug()) {
8909       Instruction *Instr = &I;
8910 
8911       // First filter out irrelevant instructions, to ensure no recipes are
8912       // built for them.
8913       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8914         continue;
8915 
8916       SmallVector<VPValue *, 4> Operands;
8917       auto *Phi = dyn_cast<PHINode>(Instr);
8918       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8919         Operands.push_back(Plan->getOrAddVPValue(
8920             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8921       } else {
8922         auto OpRange = Plan->mapToVPValues(Instr->operands());
8923         Operands = {OpRange.begin(), OpRange.end()};
8924       }
8925       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8926               Instr, Operands, Range, Plan)) {
8927         // If Instr can be simplified to an existing VPValue, use it.
8928         if (RecipeOrValue.is<VPValue *>()) {
8929           auto *VPV = RecipeOrValue.get<VPValue *>();
8930           Plan->addVPValue(Instr, VPV);
8931           // If the re-used value is a recipe, register the recipe for the
8932           // instruction, in case the recipe for Instr needs to be recorded.
8933           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
8934             RecipeBuilder.setRecipe(Instr, R);
8935           continue;
8936         }
8937         // Otherwise, add the new recipe.
8938         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8939         for (auto *Def : Recipe->definedValues()) {
8940           auto *UV = Def->getUnderlyingValue();
8941           Plan->addVPValue(UV, Def);
8942         }
8943 
8944         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8945             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8946           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8947           // of the header block. That can happen for truncates of induction
8948           // variables. Those recipes are moved to the phi section of the header
8949           // block after applying SinkAfter, which relies on the original
8950           // position of the trunc.
8951           assert(isa<TruncInst>(Instr));
8952           InductionsToMove.push_back(
8953               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8954         }
8955         RecipeBuilder.setRecipe(Instr, Recipe);
8956         VPBB->appendRecipe(Recipe);
8957         continue;
8958       }
8959 
8960       // Otherwise, if all widening options failed, the instruction is to be
8961       // replicated. This may create a successor for VPBB.
8962       VPBasicBlock *NextVPBB =
8963           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8964       if (NextVPBB != VPBB) {
8965         VPBB = NextVPBB;
8966         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8967                                     : "");
8968       }
8969     }
8970 
8971     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8972     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8973   }
8974 
8975   // Fold the last, empty block into its predecessor.
8976   VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
8977   assert(VPBB && "expected to fold last (empty) block");
8978   // After here, VPBB should not be used.
8979   VPBB = nullptr;
8980 
8981   assert(isa<VPRegionBlock>(Plan->getEntry()) &&
8982          !Plan->getEntry()->getEntryBasicBlock()->empty() &&
8983          "entry block must be set to a VPRegionBlock having a non-empty entry "
8984          "VPBasicBlock");
8985   RecipeBuilder.fixHeaderPhis();
8986 
8987   // ---------------------------------------------------------------------------
8988   // Transform initial VPlan: Apply previously taken decisions, in order, to
8989   // bring the VPlan to its final state.
8990   // ---------------------------------------------------------------------------
8991 
8992   // Apply Sink-After legal constraints.
8993   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
8994     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
8995     if (Region && Region->isReplicator()) {
8996       assert(Region->getNumSuccessors() == 1 &&
8997              Region->getNumPredecessors() == 1 && "Expected SESE region!");
8998       assert(R->getParent()->size() == 1 &&
8999              "A recipe in an original replicator region must be the only "
9000              "recipe in its block");
9001       return Region;
9002     }
9003     return nullptr;
9004   };
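       // Move each sink recipe after its target, distinguishing the cases where
       // the sink and/or the target are placed inside a replicate region.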
9005   for (auto &Entry : SinkAfter) {
9006     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9007     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9008 
9009     auto *TargetRegion = GetReplicateRegion(Target);
9010     auto *SinkRegion = GetReplicateRegion(Sink);
9011     if (!SinkRegion) {
9012       // If the sink source is not a replicate region, sink the recipe directly.
9013       if (TargetRegion) {
9014         // The target is in a replication region, make sure to move Sink to
9015         // the block after it, not into the replication region itself.
9016         VPBasicBlock *NextBlock =
9017             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9018         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9019       } else
9020         Sink->moveAfter(Target);
9021       continue;
9022     }
9023 
9024     // The sink source is in a replicate region. Unhook the region from the CFG.
9025     auto *SinkPred = SinkRegion->getSinglePredecessor();
9026     auto *SinkSucc = SinkRegion->getSingleSuccessor();
9027     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9028     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9029     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9030 
9031     if (TargetRegion) {
9032       // The target recipe is also in a replicate region, move the sink region
9033       // after the target region.
9034       auto *TargetSucc = TargetRegion->getSingleSuccessor();
9035       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9036       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9037       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9038     } else {
9039       // The sink source is in a replicate region; we need to move the whole
9040       // replicate region, which should contain only a single recipe in its
9041       // main block.
9042       auto *SplitBlock =
9043           Target->getParent()->splitAt(std::next(Target->getIterator()));
9044 
9045       auto *SplitPred = SplitBlock->getSinglePredecessor();
9046 
9047       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9048       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9049       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9050     }
9051   }
9052 
9053   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
9054   VPlanTransforms::removeRedundantInductionCasts(*Plan);
9055 
9056   // Now that sink-after is done, move induction recipes for optimized truncates
9057   // to the phi section of the header block.
9058   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
9059     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9060 
9061   // Adjust the recipes for any in-loop reductions.
9062   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan,
9063                              RecipeBuilder, Range.Start);
9064 
9065   // Introduce a recipe to combine the incoming and previous values of a
9066   // first-order recurrence.
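       // Conceptually (sketch): for a recurrence phi vp<%p> with backedge value
       // vp<%prev>, emit vp<%splice> = first-order-splice(vp<%p>, vp<%prev>)
       // after the recipe defining vp<%prev> and replace all other users of
       // vp<%p> with vp<%splice>.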
9067   for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9068     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9069     if (!RecurPhi)
9070       continue;
9071 
9072     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
9073     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9074     auto *Region = GetReplicateRegion(PrevRecipe);
9075     if (Region)
9076       InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
9077     if (Region || PrevRecipe->isPhi())
9078       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9079     else
9080       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9081 
9082     auto *RecurSplice = cast<VPInstruction>(
9083         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9084                              {RecurPhi, RecurPhi->getBackedgeValue()}));
9085 
9086     RecurPhi->replaceAllUsesWith(RecurSplice);
9087     // Set the first operand of RecurSplice to RecurPhi again, after replacing
9088     // all users.
9089     RecurSplice->setOperand(0, RecurPhi);
9090   }
9091 
9092   // Interleave memory: for each Interleave Group we marked earlier as relevant
9093   // for this VPlan, replace the Recipes widening its memory instructions with a
9094   // single VPInterleaveRecipe at its insertion point.
9095   for (auto IG : InterleaveGroups) {
9096     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9097         RecipeBuilder.getRecipe(IG->getInsertPos()));
9098     SmallVector<VPValue *, 4> StoredValues;
9099     for (unsigned i = 0; i < IG->getFactor(); ++i)
9100       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9101         auto *StoreR =
9102             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9103         StoredValues.push_back(StoreR->getStoredValue());
9104       }
9105 
9106     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9107                                         Recipe->getMask());
9108     VPIG->insertBefore(Recipe);
9109     unsigned J = 0;
9110     for (unsigned i = 0; i < IG->getFactor(); ++i)
9111       if (Instruction *Member = IG->getMember(i)) {
9112         if (!Member->getType()->isVoidTy()) {
9113           VPValue *OriginalV = Plan->getVPValue(Member);
9114           Plan->removeVPValueFor(Member);
9115           Plan->addVPValue(Member, VPIG->getVPValue(J));
9116           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9117           J++;
9118         }
9119         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9120       }
9121   }
9122 
9123   // From this point onwards, VPlan-to-VPlan transformations may change the plan
9124   // in ways that make accessing values through original IR values incorrect.
9125   Plan->disableValue2VPValue();
9126 
9127   VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9128   VPlanTransforms::sinkScalarOperands(*Plan);
9129   VPlanTransforms::mergeReplicateRegions(*Plan);
9130   VPlanTransforms::removeDeadRecipes(*Plan, *OrigLoop);
9131 
9132   std::string PlanName;
9133   raw_string_ostream RSO(PlanName);
9134   ElementCount VF = Range.Start;
9135   Plan->addVF(VF);
9136   RSO << "Initial VPlan for VF={" << VF;
9137   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9138     Plan->addVF(VF);
9139     RSO << "," << VF;
9140   }
9141   RSO << "},UF>=1";
9142   RSO.flush();
9143   Plan->setName(PlanName);
9144 
9145   // Fold Exit block into its predecessor if possible.
9146   // TODO: Fold block earlier once all VPlan transforms properly maintain a
9147   // VPBasicBlock as exit.
9148   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
9149 
9150   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9151   return Plan;
9152 }
9153 
9154 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9155   // Outer loop handling: They may require CFG and instruction level
9156   // transformations before even evaluating whether vectorization is profitable.
9157   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9158   // the vectorization pipeline.
9159   assert(!OrigLoop->isInnermost());
9160   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9161 
9162   // Create new empty VPlan
9163   auto Plan = std::make_unique<VPlan>();
9164 
9165   // Build hierarchical CFG
9166   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9167   HCFGBuilder.buildHierarchicalCFG();
9168 
9169   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9170        VF *= 2)
9171     Plan->addVF(VF);
9172 
9173   if (EnableVPlanPredication) {
9174     VPlanPredicator VPP(*Plan);
9175     VPP.predicate();
9176 
9177     // Avoid running transformation to recipes until masked code generation in
9178     // VPlan-native path is in place.
9179     return Plan;
9180   }
9181 
9182   SmallPtrSet<Instruction *, 1> DeadInstructions;
9183   VPlanTransforms::VPInstructionsToVPRecipes(
9184       OrigLoop, Plan,
9185       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9186       DeadInstructions, *PSE.getSE());
9187 
9188   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9189                         true, true);
9190   return Plan;
9191 }
9192 
9193 // Adjust the recipes for reductions. For in-loop reductions the chain of
9194 // instructions leading from the loop exit instruction to the phi needs to be
9195 // converted to reductions, with one operand being vector and the other being
9196 // the scalar reduction chain. For other reductions, a select is introduced
9197 // between the phi and live-out recipes when folding the tail.
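     // For example (roughly), an in-loop integer add reduction
     //   %add = add i32 %phi, %x
     // is rewritten into a VPReductionRecipe that folds the widened %x into the
     // scalar chain: REDUCE ir<%add> = ir<%phi> + reduce.add(ir<%x>).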
9198 void LoopVectorizationPlanner::adjustRecipesForReductions(
9199     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9200     ElementCount MinVF) {
9201   for (auto &Reduction : CM.getInLoopReductionChains()) {
9202     PHINode *Phi = Reduction.first;
9203     const RecurrenceDescriptor &RdxDesc =
9204         Legal->getReductionVars().find(Phi)->second;
9205     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9206 
9207     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9208       continue;
9209 
9210     // ReductionOperations are ordered top-down from the phi's use to the
9211     // LoopExitValue. We keep track of the previous item (the Chain) to tell
9212     // which of the two operands will remain scalar and which will be reduced.
9213     // For minmax the chain will be the select instructions.
9214     Instruction *Chain = Phi;
9215     for (Instruction *R : ReductionOperations) {
9216       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9217       RecurKind Kind = RdxDesc.getRecurrenceKind();
9218 
9219       VPValue *ChainOp = Plan->getVPValue(Chain);
9220       unsigned FirstOpId;
9221       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9222              "Only min/max recurrences allowed for inloop reductions");
9223       // Recognize a call to the llvm.fmuladd intrinsic.
9224       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9225       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9226              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9227       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9228         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9229                "Expected to replace a VPWidenSelectSC");
9230         FirstOpId = 1;
9231       } else {
9232         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9233                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9234                "Expected to replace a VPWidenSC");
9235         FirstOpId = 0;
9236       }
9237       unsigned VecOpId =
9238           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9239       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9240 
9241       auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
9242                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9243                          : nullptr;
9244 
9245       if (IsFMulAdd) {
9246         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9247         // need to create an fmul recipe to use as the vector operand for the
9248         // fadd reduction.
9249         VPInstruction *FMulRecipe = new VPInstruction(
9250             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9251         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9252         WidenRecipe->getParent()->insert(FMulRecipe,
9253                                          WidenRecipe->getIterator());
9254         VecOp = FMulRecipe;
9255       }
9256       VPReductionRecipe *RedRecipe =
9257           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9258       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9259       Plan->removeVPValueFor(R);
9260       Plan->addVPValue(R, RedRecipe);
9261       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9262       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9263       WidenRecipe->eraseFromParent();
9264 
9265       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9266         VPRecipeBase *CompareRecipe =
9267             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9268         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9269                "Expected to replace a VPWidenSC");
9270         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9271                "Expected no remaining users");
9272         CompareRecipe->eraseFromParent();
9273       }
9274       Chain = R;
9275     }
9276   }
9277 
9278   // If tail is folded by masking, introduce selects between the phi
9279   // and the live-out instruction of each reduction, at the beginning of the
9280   // dedicated latch block.
9281   if (CM.foldTailByMasking()) {
9282     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9283     for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9284       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9285       if (!PhiR || PhiR->isInLoop())
9286         continue;
9287       VPValue *Cond =
9288           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9289       VPValue *Red = PhiR->getBackedgeValue();
9290       assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
9291              "reduction recipe must be defined before latch");
9292       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9293     }
9294   }
9295 }
9296 
9297 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9298 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9299                                VPSlotTracker &SlotTracker) const {
9300   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9301   IG->getInsertPos()->printAsOperand(O, false);
9302   O << ", ";
9303   getAddr()->printAsOperand(O, SlotTracker);
9304   VPValue *Mask = getMask();
9305   if (Mask) {
9306     O << ", ";
9307     Mask->printAsOperand(O, SlotTracker);
9308   }
9309 
9310   unsigned OpIdx = 0;
9311   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9312     if (!IG->getMember(i))
9313       continue;
9314     if (getNumStoreOperands() > 0) {
9315       O << "\n" << Indent << "  store ";
9316       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9317       O << " to index " << i;
9318     } else {
9319       O << "\n" << Indent << "  ";
9320       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9321       O << " = load from index " << i;
9322     }
9323     ++OpIdx;
9324   }
9325 }
9326 #endif
9327 
9328 void VPWidenCallRecipe::execute(VPTransformState &State) {
9329   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9330                                   *this, State);
9331 }
9332 
9333 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9334   auto &I = *cast<SelectInst>(getUnderlyingInstr());
9335   State.ILV->setDebugLocFromInst(&I);
9336 
9337   // The condition can be loop invariant but still defined inside the
9338   // loop. This means that we can't just use the original 'cond' value.
9339   // We have to take the 'vectorized' value and pick the first lane.
9340   // Instcombine will make this a no-op.
9341   auto *InvarCond =
9342       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9343 
9344   for (unsigned Part = 0; Part < State.UF; ++Part) {
9345     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9346     Value *Op0 = State.get(getOperand(1), Part);
9347     Value *Op1 = State.get(getOperand(2), Part);
9348     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9349     State.set(this, Sel, Part);
9350     State.ILV->addMetadata(Sel, &I);
9351   }
9352 }
9353 
9354 void VPWidenRecipe::execute(VPTransformState &State) {
9355   auto &I = *cast<Instruction>(getUnderlyingValue());
9356   auto &Builder = State.Builder;
9357   switch (I.getOpcode()) {
9358   case Instruction::Call:
9359   case Instruction::Br:
9360   case Instruction::PHI:
9361   case Instruction::GetElementPtr:
9362   case Instruction::Select:
9363     llvm_unreachable("This instruction is handled by a different recipe.");
9364   case Instruction::UDiv:
9365   case Instruction::SDiv:
9366   case Instruction::SRem:
9367   case Instruction::URem:
9368   case Instruction::Add:
9369   case Instruction::FAdd:
9370   case Instruction::Sub:
9371   case Instruction::FSub:
9372   case Instruction::FNeg:
9373   case Instruction::Mul:
9374   case Instruction::FMul:
9375   case Instruction::FDiv:
9376   case Instruction::FRem:
9377   case Instruction::Shl:
9378   case Instruction::LShr:
9379   case Instruction::AShr:
9380   case Instruction::And:
9381   case Instruction::Or:
9382   case Instruction::Xor: {
9383     // Just widen unops and binops.
9384     State.ILV->setDebugLocFromInst(&I);
9385 
9386     for (unsigned Part = 0; Part < State.UF; ++Part) {
9387       SmallVector<Value *, 2> Ops;
9388       for (VPValue *VPOp : operands())
9389         Ops.push_back(State.get(VPOp, Part));
9390 
9391       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
9392 
9393       if (auto *VecOp = dyn_cast<Instruction>(V)) {
9394         VecOp->copyIRFlags(&I);
9395 
9396         // If the instruction is vectorized and was in a basic block that needed
9397         // predication, we can't propagate poison-generating flags (nuw/nsw,
9398         // exact, etc.). The control flow has been linearized and the
9399         // instruction is no longer guarded by the predicate, which could
9400         // cause the flag properties to no longer hold.
9401         if (State.MayGeneratePoisonRecipes.contains(this))
9402           VecOp->dropPoisonGeneratingFlags();
9403       }
9404 
9405       // Use this vector value for all users of the original instruction.
9406       State.set(this, V, Part);
9407       State.ILV->addMetadata(V, &I);
9408     }
9409 
9410     break;
9411   }
9412   case Instruction::ICmp:
9413   case Instruction::FCmp: {
9414     // Widen compares. Generate vector compares.
9415     bool FCmp = (I.getOpcode() == Instruction::FCmp);
9416     auto *Cmp = cast<CmpInst>(&I);
9417     State.ILV->setDebugLocFromInst(Cmp);
9418     for (unsigned Part = 0; Part < State.UF; ++Part) {
9419       Value *A = State.get(getOperand(0), Part);
9420       Value *B = State.get(getOperand(1), Part);
9421       Value *C = nullptr;
9422       if (FCmp) {
9423         // Propagate fast math flags.
9424         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9425         Builder.setFastMathFlags(Cmp->getFastMathFlags());
9426         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
9427       } else {
9428         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
9429       }
9430       State.set(this, C, Part);
9431       State.ILV->addMetadata(C, &I);
9432     }
9433 
9434     break;
9435   }
9436 
9437   case Instruction::ZExt:
9438   case Instruction::SExt:
9439   case Instruction::FPToUI:
9440   case Instruction::FPToSI:
9441   case Instruction::FPExt:
9442   case Instruction::PtrToInt:
9443   case Instruction::IntToPtr:
9444   case Instruction::SIToFP:
9445   case Instruction::UIToFP:
9446   case Instruction::Trunc:
9447   case Instruction::FPTrunc:
9448   case Instruction::BitCast: {
9449     auto *CI = cast<CastInst>(&I);
9450     State.ILV->setDebugLocFromInst(CI);
9451 
9452     // Vectorize casts.
9453     Type *DestTy = (State.VF.isScalar())
9454                        ? CI->getType()
9455                        : VectorType::get(CI->getType(), State.VF);
9456 
9457     for (unsigned Part = 0; Part < State.UF; ++Part) {
9458       Value *A = State.get(getOperand(0), Part);
9459       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
9460       State.set(this, Cast, Part);
9461       State.ILV->addMetadata(Cast, &I);
9462     }
9463     break;
9464   }
9465   default:
9466     // This instruction is not vectorized by simple widening.
9467     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
9468     llvm_unreachable("Unhandled instruction!");
9469   } // end of switch.
9470 }
9471 
9472 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9473   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
9474   // Construct a vector GEP by widening the operands of the scalar GEP as
9475   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
9476   // results in a vector of pointers when at least one operand of the GEP
9477   // is vector-typed. Thus, to keep the representation compact, we only use
9478   // vector-typed operands for loop-varying values.
9479 
9480   if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
9481     // If we are vectorizing, but the GEP has only loop-invariant operands,
9482     // the GEP we build (by only using vector-typed operands for
9483     // loop-varying values) would be a scalar pointer. Thus, to ensure we
9484     // produce a vector of pointers, we need to either arbitrarily pick an
9485     // operand to broadcast, or broadcast a clone of the original GEP.
9486     // Here, we broadcast a clone of the original.
9487     //
9488     // TODO: If at some point we decide to scalarize instructions having
9489     //       loop-invariant operands, this special case will no longer be
9490     //       required. We would add the scalarization decision to
9491     //       collectLoopScalars() and teach getVectorValue() to broadcast
9492     //       the lane-zero scalar value.
9493     auto *Clone = State.Builder.Insert(GEP->clone());
9494     for (unsigned Part = 0; Part < State.UF; ++Part) {
9495       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
9496       State.set(this, EntryPart, Part);
9497       State.ILV->addMetadata(EntryPart, GEP);
9498     }
9499   } else {
9500     // If the GEP has at least one loop-varying operand, we are sure to
9501     // produce a vector of pointers. But if we are only unrolling, we want
9502     // to produce a scalar GEP for each unroll part. Thus, the GEP we
9503     // produce with the code below will be scalar (if VF == 1) or vector
9504     // (otherwise). Note that for the unroll-only case, we still maintain
9505     // values in the vector mapping with initVector, as we do for other
9506     // instructions.
9507     for (unsigned Part = 0; Part < State.UF; ++Part) {
9508       // The pointer operand of the new GEP. If it's loop-invariant, we
9509       // won't broadcast it.
9510       auto *Ptr = IsPtrLoopInvariant
9511                       ? State.get(getOperand(0), VPIteration(0, 0))
9512                       : State.get(getOperand(0), Part);
9513 
9514       // Collect all the indices for the new GEP. If any index is
9515       // loop-invariant, we won't broadcast it.
9516       SmallVector<Value *, 4> Indices;
9517       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9518         VPValue *Operand = getOperand(I);
9519         if (IsIndexLoopInvariant[I - 1])
9520           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9521         else
9522           Indices.push_back(State.get(Operand, Part));
9523       }
9524 
9525       // If the GEP instruction is vectorized and was in a basic block that
9526       // needed predication, we can't propagate the poison-generating 'inbounds'
9527       // flag. The control flow has been linearized and the GEP is no longer
9528       // guarded by the predicate, which could cause the 'inbounds' property
9529       // to no longer hold.
9530       bool IsInBounds =
9531           GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9532 
9533       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
9534       // but it should be a vector, otherwise.
9535       auto *NewGEP = IsInBounds
9536                          ? State.Builder.CreateInBoundsGEP(
9537                                GEP->getSourceElementType(), Ptr, Indices)
9538                          : State.Builder.CreateGEP(GEP->getSourceElementType(),
9539                                                    Ptr, Indices);
9540       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9541              "NewGEP is not a pointer vector");
9542       State.set(this, NewGEP, Part);
9543       State.ILV->addMetadata(NewGEP, GEP);
9544     }
9545   }
9546 }
9547 
9548 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9549   assert(!State.Instance && "Int or FP induction being replicated.");
9550 
9551   Value *Start = getStartValue()->getLiveInIRValue();
9552   const InductionDescriptor &ID = getInductionDescriptor();
9553   TruncInst *Trunc = getTruncInst();
9554   IRBuilderBase &Builder = State.Builder;
9555   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9556   assert(State.VF.isVector() && "must have vector VF");
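       // Overview: create a vector phi whose initial value is
       // <Start, Start + Step, ..., Start + (VF-1) * Step> and advance it by
       // VF * Step once per unrolled part; the last update feeds the phi's
       // backedge.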
9557 
9558   // The value from the original loop to which we are mapping the new induction
9559   // variable.
9560   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9561 
9562   auto &DL = EntryVal->getModule()->getDataLayout();
9563 
9564   // Generate code for the induction step. Note that induction steps are
9565   // required to be loop-invariant.
9566   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
9567     if (SE.isSCEVable(IV->getType())) {
9568       SCEVExpander Exp(SE, DL, "induction");
9569       return Exp.expandCodeFor(Step, Step->getType(),
9570                                State.CFG.VectorPreHeader->getTerminator());
9571     }
9572     return cast<SCEVUnknown>(Step)->getValue();
9573   };
9574 
9575   // Fast-math-flags propagate from the original induction instruction.
9576   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9577   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9578     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9579 
9580   // Now do the actual transformations, and start with creating the step value.
9581   Value *Step = CreateStepValue(ID.getStep());
9582 
9583   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
9584          "Expected either an induction phi-node or a truncate of it!");
9585 
9586   // Construct the initial value of the vector IV in the vector loop preheader.
9587   auto CurrIP = Builder.saveIP();
9588   Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator());
9589   if (isa<TruncInst>(EntryVal)) {
9590     assert(Start->getType()->isIntegerTy() &&
9591            "Truncation requires an integer type");
9592     auto *TruncType = cast<IntegerType>(EntryVal->getType());
9593     Step = Builder.CreateTrunc(Step, TruncType);
9594     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
9595   }
9596 
9597   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
9598   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
9599   Value *SteppedStart = getStepVector(
9600       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
9601 
9602   // We create vector phi nodes for both integer and floating-point induction
9603   // variables. Here, we determine the kind of arithmetic we will perform.
9604   Instruction::BinaryOps AddOp;
9605   Instruction::BinaryOps MulOp;
9606   if (Step->getType()->isIntegerTy()) {
9607     AddOp = Instruction::Add;
9608     MulOp = Instruction::Mul;
9609   } else {
9610     AddOp = ID.getInductionOpcode();
9611     MulOp = Instruction::FMul;
9612   }
9613 
9614   // Multiply the vectorization factor by the step using integer or
9615   // floating-point arithmetic as appropriate.
9616   Type *StepType = Step->getType();
9617   Value *RuntimeVF;
9618   if (Step->getType()->isFloatingPointTy())
9619     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
9620   else
9621     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
9622   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
9623 
9624   // Create a vector splat to use in the induction update.
9625   //
9626   // FIXME: If the step is non-constant, we create the vector splat with
9627   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
9628   //        handle a constant vector splat.
9629   Value *SplatVF = isa<Constant>(Mul)
9630                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
9631                        : Builder.CreateVectorSplat(State.VF, Mul);
9632   Builder.restoreIP(CurrIP);
9633 
9634   // We may need to add the step a number of times, depending on the unroll
9635   // factor. The last of those goes into the PHI.
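  // For illustration (roughly), an integer IV with start S, step 1, VF = 4 and
  // UF = 2 becomes:
  //   %vec.ind      = phi <4 x i32> [ <S, S+1, S+2, S+3>, %preheader ],
  //                                 [ %vec.ind.next, %latch ]
  //   %step.add     = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  //   %vec.ind.next = add <4 x i32> %step.add, <i32 4, i32 4, i32 4, i32 4>
  // where %vec.ind and %step.add provide the per-part values and %vec.ind.next
  // is sunk to the latch below.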
9636   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
9637                                     &*State.CFG.PrevBB->getFirstInsertionPt());
9638   VecInd->setDebugLoc(EntryVal->getDebugLoc());
9639   Instruction *LastInduction = VecInd;
9640   for (unsigned Part = 0; Part < State.UF; ++Part) {
9641     State.set(this, LastInduction, Part);
9642 
9643     if (isa<TruncInst>(EntryVal))
9644       State.ILV->addMetadata(LastInduction, EntryVal);
9645 
9646     LastInduction = cast<Instruction>(
9647         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
9648     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
9649   }
9650 
9651   // Move the last step to the end of the latch block. This ensures consistent
9652   // placement of all induction updates.
9653   auto *LoopVectorLatch =
9654       State.LI->getLoopFor(State.CFG.PrevBB)->getLoopLatch();
9655   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
9656   LastInduction->moveBefore(Br);
9657   LastInduction->setName("vec.ind.next");
9658 
9659   VecInd->addIncoming(SteppedStart, State.CFG.VectorPreHeader);
9660   VecInd->addIncoming(LastInduction, LoopVectorLatch);
9661 }
9662 
9663 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9664   assert(!State.Instance && "VPScalarIVStepsRecipe being replicated.");
9665 
9666   // Fast-math-flags propagate from the original induction instruction.
9667   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9668   if (IndDesc.getInductionBinOp() &&
9669       isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9670     State.Builder.setFastMathFlags(
9671         IndDesc.getInductionBinOp()->getFastMathFlags());
9672 
9673   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9674   auto CreateScalarIV = [&](Value *&Step) -> Value * {
9675     Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9676     auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
9677     if (!isCanonical() || CanonicalIV->getType() != Ty) {
9678       ScalarIV =
9679           Ty->isIntegerTy()
9680               ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty)
9681               : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty);
9682       ScalarIV = emitTransformedIndex(State.Builder, ScalarIV,
9683                                       getStartValue()->getLiveInIRValue(), Step,
9684                                       IndDesc);
9685       ScalarIV->setName("offset.idx");
9686     }
9687     if (TruncToTy) {
9688       assert(Step->getType()->isIntegerTy() &&
9689              "Truncation requires an integer step");
9690       ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy);
9691       Step = State.Builder.CreateTrunc(Step, TruncToTy);
9692     }
9693     return ScalarIV;
9694   };
9695 
9696   Value *ScalarIV = CreateScalarIV(Step);
9697   if (State.VF.isVector()) {
9698     buildScalarSteps(ScalarIV, Step, IndDesc, this, State);
9699     return;
9700   }
9701 
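  // With a scalar VF, each unrolled part P simply gets the scalar value
  // ScalarIV + P * Step (using the FP induction opcode and an fmul for
  // floating-point inductions).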
9702   for (unsigned Part = 0; Part < State.UF; ++Part) {
9703     assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
9704     Value *EntryPart;
9705     if (Step->getType()->isFloatingPointTy()) {
9706       Value *StartIdx =
9707           getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part);
9708       // Floating-point operations inherit FMF via the builder's flags.
9709       Value *MulOp = State.Builder.CreateFMul(StartIdx, Step);
9710       EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(),
9711                                             ScalarIV, MulOp);
9712     } else {
9713       Value *StartIdx =
9714           getRuntimeVF(State.Builder, Step->getType(), State.VF * Part);
9715       EntryPart = State.Builder.CreateAdd(
9716           ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction");
9717     }
9718     State.set(this, EntryPart, Part);
9719   }
9720 }
9721 
9722 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9723   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
9724                                  State);
9725 }
9726 
9727 void VPBlendRecipe::execute(VPTransformState &State) {
9728   State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9729   // We know that all PHIs in non-header blocks are converted into
9730   // selects, so we don't have to worry about the insertion order and we
9731   // can just use the builder.
9732   // At this point we generate the predication tree. There may be
9733   // duplications since this is a simple recursive scan, but future
9734   // optimizations will clean it up.
9735 
9736   unsigned NumIncoming = getNumIncomingValues();
9737 
9738   // Generate a sequence of selects of the form:
9739   // SELECT(Mask3, In3,
9740   //        SELECT(Mask2, In2,
9741   //               SELECT(Mask1, In1,
9742   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi,
  // and which are essentially undef, are taken from In0.
9745   InnerLoopVectorizer::VectorParts Entry(State.UF);
9746   for (unsigned In = 0; In < NumIncoming; ++In) {
9747     for (unsigned Part = 0; Part < State.UF; ++Part) {
9748       // We might have single edge PHIs (blocks) - use an identity
9749       // 'select' for the first PHI operand.
9750       Value *In0 = State.get(getIncomingValue(In), Part);
9751       if (In == 0)
9752         Entry[Part] = In0; // Initialize with the first incoming value.
9753       else {
9754         // Select between the current value and the previous incoming edge
9755         // based on the incoming mask.
9756         Value *Cond = State.get(getMask(In), Part);
9757         Entry[Part] =
9758             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9759       }
9760     }
9761   }
9762   for (unsigned Part = 0; Part < State.UF; ++Part)
9763     State.set(this, Entry[Part], Part);
9764 }
9765 
9766 void VPInterleaveRecipe::execute(VPTransformState &State) {
9767   assert(!State.Instance && "Interleave group being replicated.");
9768   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9769                                       getStoredValues(), getMask());
9770 }
9771 
9772 void VPReductionRecipe::execute(VPTransformState &State) {
9773   assert(!State.Instance && "Reduction being replicated.");
9774   Value *PrevInChain = State.get(getChainOp(), 0);
9775   RecurKind Kind = RdxDesc->getRecurrenceKind();
9776   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9777   // Propagate the fast-math flags carried by the underlying instruction.
9778   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9779   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9780   for (unsigned Part = 0; Part < State.UF; ++Part) {
9781     Value *NewVecOp = State.get(getVecOp(), Part);
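    // If a condition (block-in mask) is given, blend the inactive lanes with
    // the reduction identity (e.g. 0 for an integer add) so that they do not
    // affect the reduced value.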
9782     if (VPValue *Cond = getCondOp()) {
9783       Value *NewCond = State.get(Cond, Part);
9784       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9785       Value *Iden = RdxDesc->getRecurrenceIdentity(
9786           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9787       Value *IdenVec =
9788           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9789       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9790       NewVecOp = Select;
9791     }
9792     Value *NewRed;
9793     Value *NextInChain;
9794     if (IsOrdered) {
9795       if (State.VF.isVector())
9796         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9797                                         PrevInChain);
9798       else
9799         NewRed = State.Builder.CreateBinOp(
9800             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9801             NewVecOp);
9802       PrevInChain = NewRed;
9803     } else {
9804       PrevInChain = State.get(getChainOp(), Part);
9805       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9806     }
9807     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9808       NextInChain =
9809           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9810                          NewRed, PrevInChain);
9811     } else if (IsOrdered)
9812       NextInChain = NewRed;
9813     else
9814       NextInChain = State.Builder.CreateBinOp(
9815           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9816           PrevInChain);
9817     State.set(this, NextInChain, Part);
9818   }
9819 }
9820 
9821 void VPReplicateRecipe::execute(VPTransformState &State) {
9822   if (State.Instance) { // Generate a single instance.
9823     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9824     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9825                                     IsPredicated, State);
9826     // Insert scalar instance packing it into a vector.
9827     if (AlsoPack && State.VF.isVector()) {
9828       // If we're constructing lane 0, initialize to start from poison.
9829       if (State.Instance->Lane.isFirstLane()) {
9830         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9831         Value *Poison = PoisonValue::get(
9832             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9833         State.set(this, Poison, State.Instance->Part);
9834       }
9835       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9836     }
9837     return;
9838   }
9839 
9840   // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
9843   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9844   assert((!State.VF.isScalable() || IsUniform) &&
9845          "Can't scalarize a scalable vector");
9846   for (unsigned Part = 0; Part < State.UF; ++Part)
9847     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9848       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9849                                       VPIteration(Part, Lane), IsPredicated,
9850                                       State);
9851 }
9852 
9853 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9854   assert(State.Instance && "Branch on Mask works only on single instance.");
9855 
9856   unsigned Part = State.Instance->Part;
9857   unsigned Lane = State.Instance->Lane.getKnownLane();
9858 
9859   Value *ConditionBit = nullptr;
9860   VPValue *BlockInMask = getMask();
9861   if (BlockInMask) {
9862     ConditionBit = State.get(BlockInMask, Part);
9863     if (ConditionBit->getType()->isVectorTy())
9864       ConditionBit = State.Builder.CreateExtractElement(
9865           ConditionBit, State.Builder.getInt32(Lane));
9866   } else // Block in mask is all-one.
9867     ConditionBit = State.Builder.getTrue();
9868 
9869   // Replace the temporary unreachable terminator with a new conditional branch,
9870   // whose two destinations will be set later when they are created.
9871   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9872   assert(isa<UnreachableInst>(CurrentTerminator) &&
9873          "Expected to replace unreachable terminator with conditional branch.");
9874   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9875   CondBr->setSuccessor(0, nullptr);
9876   ReplaceInstWithInst(CurrentTerminator, CondBr);
9877 }
9878 
9879 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9880   assert(State.Instance && "Predicated instruction PHI works per instance.");
9881   Instruction *ScalarPredInst =
9882       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9883   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9884   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9885   assert(PredicatingBB && "Predicated block has no single predecessor.");
9886   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9887          "operand must be VPReplicateRecipe");
9888 
9889   // By current pack/unpack logic we need to generate only a single phi node: if
9890   // a vector value for the predicated instruction exists at this point it means
9891   // the instruction has vector users only, and a phi for the vector value is
9892   // needed. In this case the recipe of the predicated instruction is marked to
9893   // also do that packing, thereby "hoisting" the insert-element sequence.
9894   // Otherwise, a phi node for the scalar value is needed.
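  // For illustration, in the vector case the generated phi merges the vector
  // before and after the predicated insertelement, roughly:
  //   %vphi = phi <VF x Ty> [ %vec.unmodified, %predicating.bb ],
  //                         [ %vec.with.lane,  %predicated.bb ]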
9895   unsigned Part = State.Instance->Part;
9896   if (State.hasVectorValue(getOperand(0), Part)) {
9897     Value *VectorValue = State.get(getOperand(0), Part);
9898     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9899     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9900     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9901     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9902     if (State.hasVectorValue(this, Part))
9903       State.reset(this, VPhi, Part);
9904     else
9905       State.set(this, VPhi, Part);
9906     // NOTE: Currently we need to update the value of the operand, so the next
9907     // predicated iteration inserts its generated value in the correct vector.
9908     State.reset(getOperand(0), VPhi, Part);
9909   } else {
9910     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9911     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9912     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9913                      PredicatingBB);
9914     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9915     if (State.hasScalarValue(this, *State.Instance))
9916       State.reset(this, Phi, *State.Instance);
9917     else
9918       State.set(this, Phi, *State.Instance);
9919     // NOTE: Currently we need to update the value of the operand, so the next
9920     // predicated iteration inserts its generated value in the correct vector.
9921     State.reset(getOperand(0), Phi, *State.Instance);
9922   }
9923 }
9924 
9925 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9926   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9927 
9928   // Attempt to issue a wide load.
9929   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9930   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9931 
9932   assert((LI || SI) && "Invalid Load/Store instruction");
9933   assert((!SI || StoredValue) && "No stored value provided for widened store");
9934   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9935 
9936   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9937 
9938   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9939   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9940   bool CreateGatherScatter = !Consecutive;
9941 
9942   auto &Builder = State.Builder;
9943   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9944   bool isMaskRequired = getMask();
9945   if (isMaskRequired)
9946     for (unsigned Part = 0; Part < State.UF; ++Part)
9947       BlockInMaskParts[Part] = State.get(getMask(), Part);
9948 
9949   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9950     // Calculate the pointer for the specific unroll-part.
9951     GetElementPtrInst *PartPtr = nullptr;
9952 
9953     bool InBounds = false;
9954     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9955       InBounds = gep->isInBounds();
9956     if (Reverse) {
9957       // If the address is consecutive but reversed, then the
9958       // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width, VScale is 1, so RunTimeVF = VF.getKnownMinValue().
9961       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9962       // NumElt = -Part * RunTimeVF
9963       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9964       // LastLane = 1 - RunTimeVF
9965       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
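      // E.g. for a fixed VF of 4, part 0 addresses the block [Ptr-3, Ptr] and
      // part 1 the block [Ptr-7, Ptr-4]; the loaded or stored vector is
      // reversed separately, so lane L of part P still maps to Ptr[-(P*4+L)].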
9966       PartPtr =
9967           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9968       PartPtr->setIsInBounds(InBounds);
9969       PartPtr = cast<GetElementPtrInst>(
9970           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9971       PartPtr->setIsInBounds(InBounds);
9972       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9973         BlockInMaskParts[Part] =
9974             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9975     } else {
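      // Consecutive forward access: part P starts VF * P elements (scaled by
      // vscale for scalable vectors) past Ptr.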
9976       Value *Increment =
9977           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9978       PartPtr = cast<GetElementPtrInst>(
9979           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9980       PartPtr->setIsInBounds(InBounds);
9981     }
9982 
9983     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9984     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9985   };
9986 
9987   // Handle Stores:
9988   if (SI) {
9989     State.ILV->setDebugLocFromInst(SI);
9990 
9991     for (unsigned Part = 0; Part < State.UF; ++Part) {
9992       Instruction *NewSI = nullptr;
9993       Value *StoredVal = State.get(StoredValue, Part);
9994       if (CreateGatherScatter) {
9995         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9996         Value *VectorGep = State.get(getAddr(), Part);
9997         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9998                                             MaskPart);
9999       } else {
10000         if (Reverse) {
10001           // If we store to reverse consecutive memory locations, then we need
10002           // to reverse the order of elements in the stored value.
10003           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
10004           // We don't want to update the value in the map as it might be used in
10005           // another expression. So don't call resetVectorValue(StoredVal).
10006         }
10007         auto *VecPtr =
10008             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10009         if (isMaskRequired)
10010           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
10011                                             BlockInMaskParts[Part]);
10012         else
10013           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
10014       }
10015       State.ILV->addMetadata(NewSI, SI);
10016     }
10017     return;
10018   }
10019 
10020   // Handle loads.
10021   assert(LI && "Must have a load instruction");
10022   State.ILV->setDebugLocFromInst(LI);
10023   for (unsigned Part = 0; Part < State.UF; ++Part) {
10024     Value *NewLI;
10025     if (CreateGatherScatter) {
10026       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
10027       Value *VectorGep = State.get(getAddr(), Part);
10028       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
10029                                          nullptr, "wide.masked.gather");
10030       State.ILV->addMetadata(NewLI, LI);
10031     } else {
10032       auto *VecPtr =
10033           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10034       if (isMaskRequired)
10035         NewLI = Builder.CreateMaskedLoad(
10036             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
10037             PoisonValue::get(DataTy), "wide.masked.load");
10038       else
10039         NewLI =
10040             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
10041 
      // Add metadata to the load itself; the vector value recorded below is
      // the reverse shuffle when the access is reversed.
10043       State.ILV->addMetadata(NewLI, LI);
10044       if (Reverse)
10045         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
10046     }
10047 
10048     State.set(this, NewLI, Part);
10049   }
10050 }
10051 
10052 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10053 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10054 // predication, and 4) a TTI hook that analyses whether the loop is suitable
10055 // for predication.
10056 static ScalarEpilogueLowering getScalarEpilogueLowering(
10057     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10058     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10059     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
10060     LoopVectorizationLegality &LVL) {
10061   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10062   // don't look at hints or options, and don't request a scalar epilogue.
10063   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10064   // LoopAccessInfo (due to code dependency and not being able to reliably get
10065   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10066   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10067   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10068   // back to the old way and vectorize with versioning when forced. See D81345.)
10069   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10070                                                       PGSOQueryType::IRPass) &&
10071                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10072     return CM_ScalarEpilogueNotAllowedOptSize;
10073 
10074   // 2) If set, obey the directives
10075   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10076     switch (PreferPredicateOverEpilogue) {
10077     case PreferPredicateTy::ScalarEpilogue:
10078       return CM_ScalarEpilogueAllowed;
10079     case PreferPredicateTy::PredicateElseScalarEpilogue:
10080       return CM_ScalarEpilogueNotNeededUsePredicate;
10081     case PreferPredicateTy::PredicateOrDontVectorize:
10082       return CM_ScalarEpilogueNotAllowedUsePredicate;
10083     };
10084   }
10085 
10086   // 3) If set, obey the hints
10087   switch (Hints.getPredicate()) {
10088   case LoopVectorizeHints::FK_Enabled:
10089     return CM_ScalarEpilogueNotNeededUsePredicate;
10090   case LoopVectorizeHints::FK_Disabled:
10091     return CM_ScalarEpilogueAllowed;
10092   };
10093 
10094   // 4) if the TTI hook indicates this is profitable, request predication.
10095   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10096                                        LVL.getLAI()))
10097     return CM_ScalarEpilogueNotNeededUsePredicate;
10098 
10099   return CM_ScalarEpilogueAllowed;
10100 }
10101 
10102 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
10103   // If Values have been set for this Def return the one relevant for \p Part.
10104   if (hasVectorValue(Def, Part))
10105     return Data.PerPartOutput[Def][Part];
10106 
10107   if (!hasScalarValue(Def, {Part, 0})) {
10108     Value *IRV = Def->getLiveInIRValue();
10109     Value *B = ILV->getBroadcastInstrs(IRV);
10110     set(Def, B, Part);
10111     return B;
10112   }
10113 
10114   Value *ScalarValue = get(Def, {Part, 0});
10115   // If we aren't vectorizing, we can just copy the scalar map values over
10116   // to the vector map.
10117   if (VF.isScalar()) {
10118     set(Def, ScalarValue, Part);
10119     return ScalarValue;
10120   }
10121 
10122   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
10123   bool IsUniform = RepR && RepR->isUniform();
10124 
10125   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
10126   // Check if there is a scalar value for the selected lane.
10127   if (!hasScalarValue(Def, {Part, LastLane})) {
10128     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
10129     assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
10130             isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
10131            "unexpected recipe found to be invariant");
10132     IsUniform = true;
10133     LastLane = 0;
10134   }
10135 
10136   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10137   // Set the insert point after the last scalarized instruction or after the
10138   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10139   // will directly follow the scalar definitions.
10140   auto OldIP = Builder.saveIP();
10141   auto NewIP =
10142       isa<PHINode>(LastInst)
10143           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10144           : std::next(BasicBlock::iterator(LastInst));
10145   Builder.SetInsertPoint(&*NewIP);
10146 
10147   // However, if we are vectorizing, we need to construct the vector values.
10148   // If the value is known to be uniform after vectorization, we can just
10149   // broadcast the scalar value corresponding to lane zero for each unroll
10150   // iteration. Otherwise, we construct the vector values using
10151   // insertelement instructions. Since the resulting vectors are stored in
10152   // State, we will only generate the insertelements once.
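  // For illustration, packing a non-uniform definition with VF = 4 emits
  // roughly:
  //   %p0 = insertelement <4 x Ty> poison, Ty %lane0, i32 0
  //   %p1 = insertelement <4 x Ty> %p0, Ty %lane1, i32 1
  //   ... up to lane 3.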
10153   Value *VectorValue = nullptr;
10154   if (IsUniform) {
10155     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10156     set(Def, VectorValue, Part);
10157   } else {
    // Initialize packing with insertelements to start from poison.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Poison, Part);
10162     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10163       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10164     VectorValue = get(Def, Part);
10165   }
10166   Builder.restoreIP(OldIP);
10167   return VectorValue;
10168 }
10169 
10170 // Process the loop in the VPlan-native vectorization path. This path builds
10171 // VPlan upfront in the vectorization pipeline, which allows to apply
10172 // VPlan-to-VPlan transformations from the very beginning without modifying the
10173 // input LLVM IR.
10174 static bool processLoopInVPlanNativePath(
10175     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10176     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10177     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10178     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10179     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10180     LoopVectorizationRequirements &Requirements) {
10181 
10182   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10183     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10184     return false;
10185   }
10186   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10187   Function *F = L->getHeader()->getParent();
10188   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10189 
10190   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10191       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10192 
10193   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10194                                 &Hints, IAI);
10195   // Use the planner for outer loop vectorization.
10196   // TODO: CM is not used at this point inside the planner. Turn CM into an
10197   // optional argument if we don't need it in the future.
10198   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10199                                Requirements, ORE);
10200 
10201   // Get user vectorization factor.
10202   ElementCount UserVF = Hints.getWidth();
10203 
10204   CM.collectElementTypesForWidening();
10205 
10206   // Plan how to best vectorize, return the best VF and its cost.
10207   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10208 
10209   // If we are stress testing VPlan builds, do not attempt to generate vector
10210   // code. Masked vector code generation support will follow soon.
10211   // Also, do not attempt to vectorize if no vector code will be produced.
10212   if (VPlanBuildStressTest || EnableVPlanPredication ||
10213       VectorizationFactor::Disabled() == VF)
10214     return false;
10215 
10216   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10217 
10218   {
10219     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10220                              F->getParent()->getDataLayout());
10221     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10222                            &CM, BFI, PSI, Checks);
10223     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10224                       << L->getHeader()->getParent()->getName() << "\"\n");
10225     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10226   }
10227 
10228   // Mark the loop as already vectorized to avoid vectorizing again.
10229   Hints.setAlreadyVectorized();
10230   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10231   return true;
10232 }
10233 
10234 // Emit a remark if there are stores to floats that required a floating point
10235 // extension. If the vectorized loop was generated with floating point there
10236 // will be a performance penalty from the conversion overhead and the change in
10237 // the vector width.
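// For example (illustrative), a loop such as
//   for (i) FloatArr[i] += DoubleVal;
// extends each float to double (fpext) before the add and truncates it back
// for the store, so only half as many elements fit into each vector compared
// with an all-float loop.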
10238 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10239   SmallVector<Instruction *, 4> Worklist;
10240   for (BasicBlock *BB : L->getBlocks()) {
10241     for (Instruction &Inst : *BB) {
10242       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10243         if (S->getValueOperand()->getType()->isFloatTy())
10244           Worklist.push_back(S);
10245       }
10246     }
10247   }
10248 
  // Traverse the floating point stores upwards, searching for floating point
10250   // conversions.
10251   SmallPtrSet<const Instruction *, 4> Visited;
10252   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10253   while (!Worklist.empty()) {
10254     auto *I = Worklist.pop_back_val();
10255     if (!L->contains(I))
10256       continue;
10257     if (!Visited.insert(I).second)
10258       continue;
10259 
10260     // Emit a remark if the floating point store required a floating
10261     // point conversion.
10262     // TODO: More work could be done to identify the root cause such as a
10263     // constant or a function return type and point the user to it.
10264     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10265       ORE->emit([&]() {
10266         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10267                                           I->getDebugLoc(), L->getHeader())
10268                << "floating point conversion changes vector width. "
10269                << "Mixed floating point precision requires an up/down "
10270                << "cast that will negatively impact performance.";
10271       });
10272 
10273     for (Use &Op : I->operands())
10274       if (auto *OpI = dyn_cast<Instruction>(Op))
10275         Worklist.push_back(OpI);
10276   }
10277 }
10278 
10279 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10280     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10281                                !EnableLoopInterleaving),
10282       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10283                               !EnableLoopVectorization) {}
10284 
10285 bool LoopVectorizePass::processLoop(Loop *L) {
10286   assert((EnableVPlanNativePath || L->isInnermost()) &&
10287          "VPlan-native path is not enabled. Only process inner loops.");
10288 
10289 #ifndef NDEBUG
10290   const std::string DebugLocStr = getDebugLocString(L);
10291 #endif /* NDEBUG */
10292 
10293   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10294                     << L->getHeader()->getParent()->getName() << "' from "
10295                     << DebugLocStr << "\n");
10296 
10297   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10298 
10299   LLVM_DEBUG(
10300       dbgs() << "LV: Loop hints:"
10301              << " force="
10302              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10303                      ? "disabled"
10304                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10305                             ? "enabled"
10306                             : "?"))
10307              << " width=" << Hints.getWidth()
10308              << " interleave=" << Hints.getInterleave() << "\n");
10309 
10310   // Function containing loop
10311   Function *F = L->getHeader()->getParent();
10312 
10313   // Looking at the diagnostic output is the only way to determine if a loop
10314   // was vectorized (other than looking at the IR or machine code), so it
10315   // is important to generate an optimization remark for each loop. Most of
10316   // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose, reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
10320 
10321   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10322     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10323     return false;
10324   }
10325 
10326   PredicatedScalarEvolution PSE(*SE, *L);
10327 
10328   // Check if it is legal to vectorize the loop.
10329   LoopVectorizationRequirements Requirements;
10330   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10331                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10332   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10333     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10334     Hints.emitRemarkWithHints();
10335     return false;
10336   }
10337 
10338   // Check the function attributes and profiles to find out if this function
10339   // should be optimized for size.
10340   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10341       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10342 
10343   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10344   // here. They may require CFG and instruction level transformations before
10345   // even evaluating whether vectorization is profitable. Since we cannot modify
10346   // the incoming IR, we need to build VPlan upfront in the vectorization
10347   // pipeline.
10348   if (!L->isInnermost())
10349     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10350                                         ORE, BFI, PSI, Hints, Requirements);
10351 
10352   assert(L->isInnermost() && "Inner loop expected.");
10353 
10354   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10355   // count by optimizing for size, to minimize overheads.
10356   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10357   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10358     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10359                       << "This loop is worth vectorizing only if no scalar "
10360                       << "iteration overheads are incurred.");
10361     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10362       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10363     else {
10364       LLVM_DEBUG(dbgs() << "\n");
10365       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10366     }
10367   }
10368 
10369   // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem like it can be correct -- what if the loop
  // is an integer loop and the vector instructions selected are purely
  // integer vector instructions?
10373   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10374     reportVectorizationFailure(
10375         "Can't vectorize when the NoImplicitFloat attribute is used",
10376         "loop not vectorized due to NoImplicitFloat attribute",
10377         "NoImplicitFloat", ORE, L);
10378     Hints.emitRemarkWithHints();
10379     return false;
10380   }
10381 
10382   // Check if the target supports potentially unsafe FP vectorization.
10383   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10384   // for the target we're vectorizing for, to make sure none of the
10385   // additional fp-math flags can help.
10386   if (Hints.isPotentiallyUnsafe() &&
10387       TTI->isFPVectorizationPotentiallyUnsafe()) {
10388     reportVectorizationFailure(
10389         "Potentially unsafe FP op prevents vectorization",
10390         "loop not vectorized due to unsafe FP support.",
10391         "UnsafeFP", ORE, L);
10392     Hints.emitRemarkWithHints();
10393     return false;
10394   }
10395 
10396   bool AllowOrderedReductions;
10397   // If the flag is set, use that instead and override the TTI behaviour.
10398   if (ForceOrderedReductions.getNumOccurrences() > 0)
10399     AllowOrderedReductions = ForceOrderedReductions;
10400   else
10401     AllowOrderedReductions = TTI->enableOrderedReductions();
10402   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10403     ORE->emit([&]() {
10404       auto *ExactFPMathInst = Requirements.getExactFPInst();
10405       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10406                                                  ExactFPMathInst->getDebugLoc(),
10407                                                  ExactFPMathInst->getParent())
10408              << "loop not vectorized: cannot prove it is safe to reorder "
10409                 "floating-point operations";
10410     });
10411     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10412                          "reorder floating-point operations\n");
10413     Hints.emitRemarkWithHints();
10414     return false;
10415   }
10416 
10417   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10418   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10419 
10420   // If an override option has been passed in for interleaved accesses, use it.
10421   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10422     UseInterleaved = EnableInterleavedMemAccesses;
10423 
10424   // Analyze interleaved memory accesses.
10425   if (UseInterleaved) {
10426     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10427   }
10428 
10429   // Use the cost model.
10430   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10431                                 F, &Hints, IAI);
10432   CM.collectValuesToIgnore();
10433   CM.collectElementTypesForWidening();
10434 
10435   // Use the planner for vectorization.
10436   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10437                                Requirements, ORE);
10438 
10439   // Get user vectorization factor and interleave count.
10440   ElementCount UserVF = Hints.getWidth();
10441   unsigned UserIC = Hints.getInterleave();
10442 
10443   // Plan how to best vectorize, return the best VF and its cost.
10444   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10445 
10446   VectorizationFactor VF = VectorizationFactor::Disabled();
10447   unsigned IC = 1;
10448 
10449   if (MaybeVF) {
10450     VF = *MaybeVF;
10451     // Select the interleave count.
10452     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10453   }
10454 
10455   // Identify the diagnostic messages that should be produced.
10456   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10457   bool VectorizeLoop = true, InterleaveLoop = true;
10458   if (VF.Width.isScalar()) {
10459     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10460     VecDiagMsg = std::make_pair(
10461         "VectorizationNotBeneficial",
10462         "the cost-model indicates that vectorization is not beneficial");
10463     VectorizeLoop = false;
10464   }
10465 
10466   if (!MaybeVF && UserIC > 1) {
10467     // Tell the user interleaving was avoided up-front, despite being explicitly
10468     // requested.
10469     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10470                          "interleaving should be avoided up front\n");
10471     IntDiagMsg = std::make_pair(
10472         "InterleavingAvoided",
10473         "Ignoring UserIC, because interleaving was avoided up front");
10474     InterleaveLoop = false;
10475   } else if (IC == 1 && UserIC <= 1) {
10476     // Tell the user interleaving is not beneficial.
10477     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10478     IntDiagMsg = std::make_pair(
10479         "InterleavingNotBeneficial",
10480         "the cost-model indicates that interleaving is not beneficial");
10481     InterleaveLoop = false;
10482     if (UserIC == 1) {
10483       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10484       IntDiagMsg.second +=
10485           " and is explicitly disabled or interleave count is set to 1";
10486     }
10487   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
10489     LLVM_DEBUG(
10490         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10491     IntDiagMsg = std::make_pair(
10492         "InterleavingBeneficialButDisabled",
10493         "the cost-model indicates that interleaving is beneficial "
10494         "but is explicitly disabled or interleave count is set to 1");
10495     InterleaveLoop = false;
10496   }
10497 
10498   // Override IC if user provided an interleave count.
10499   IC = UserIC > 0 ? UserIC : IC;
10500 
10501   // Emit diagnostic messages, if any.
10502   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10503   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
10505     ORE->emit([&]() {
10506       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10507                                       L->getStartLoc(), L->getHeader())
10508              << VecDiagMsg.second;
10509     });
10510     ORE->emit([&]() {
10511       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10512                                       L->getStartLoc(), L->getHeader())
10513              << IntDiagMsg.second;
10514     });
10515     return false;
10516   } else if (!VectorizeLoop && InterleaveLoop) {
10517     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10518     ORE->emit([&]() {
10519       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10520                                         L->getStartLoc(), L->getHeader())
10521              << VecDiagMsg.second;
10522     });
10523   } else if (VectorizeLoop && !InterleaveLoop) {
10524     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10525                       << ") in " << DebugLocStr << '\n');
10526     ORE->emit([&]() {
10527       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10528                                         L->getStartLoc(), L->getHeader())
10529              << IntDiagMsg.second;
10530     });
10531   } else if (VectorizeLoop && InterleaveLoop) {
10532     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10533                       << ") in " << DebugLocStr << '\n');
10534     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10535   }
10536 
10537   bool DisableRuntimeUnroll = false;
10538   MDNode *OrigLoopID = L->getLoopID();
10539   {
10540     // Optimistically generate runtime checks. Drop them if they turn out to not
10541     // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
10543     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10544                              F->getParent()->getDataLayout());
10545     if (!VF.Width.isScalar() || IC > 1)
10546       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate());
10547 
10548     using namespace ore;
10549     if (!VectorizeLoop) {
10550       assert(IC > 1 && "interleave count should not be 1 or 0");
10551       // If we decided that it is not legal to vectorize the loop, then
10552       // interleave it.
10553       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10554                                  &CM, BFI, PSI, Checks);
10555 
10556       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10557       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10558 
10559       ORE->emit([&]() {
10560         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10561                                   L->getHeader())
10562                << "interleaved loop (interleaved count: "
10563                << NV("InterleaveCount", IC) << ")";
10564       });
10565     } else {
10566       // If we decided that it is *legal* to vectorize the loop, then do it.
10567 
10568       // Consider vectorizing the epilogue too if it's profitable.
10569       VectorizationFactor EpilogueVF =
10570           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10571       if (EpilogueVF.Width.isVector()) {
10572 
10573         // The first pass vectorizes the main loop and creates a scalar epilogue
10574         // to be vectorized by executing the plan (potentially with a different
10575         // factor) again shortly afterwards.
10576         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10577         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10578                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10579 
10580         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10581         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10582                         DT);
10583         ++LoopsVectorized;
10584 
10585         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10586         formLCSSARecursively(*L, *DT, LI, SE);
10587 
10588         // Second pass vectorizes the epilogue and adjusts the control flow
10589         // edges from the first pass.
10590         EPI.MainLoopVF = EPI.EpilogueVF;
10591         EPI.MainLoopUF = EPI.EpilogueUF;
10592         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10593                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10594                                                  Checks);
10595 
10596         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10597 
10598         // Ensure that the start values for any VPReductionPHIRecipes are
10599         // updated before vectorising the epilogue loop.
10600         VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock();
10601         for (VPRecipeBase &R : Header->phis()) {
10602           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10603             if (auto *Resume = MainILV.getReductionResumeValue(
10604                     ReductionPhi->getRecurrenceDescriptor())) {
10605               VPValue *StartVal = new VPValue(Resume);
10606               BestEpiPlan.addExternalDef(StartVal);
10607               ReductionPhi->setOperand(0, StartVal);
10608             }
10609           }
10610         }
10611 
10612         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10613                         DT);
10614         ++LoopsEpilogueVectorized;
10615 
10616         if (!MainILV.areSafetyChecksAdded())
10617           DisableRuntimeUnroll = true;
10618       } else {
10619         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10620                                &LVL, &CM, BFI, PSI, Checks);
10621 
10622         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10623         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
10624         ++LoopsVectorized;
10625 
10626         // Add metadata to disable runtime unrolling a scalar loop when there
10627         // are no runtime checks about strides and memory. A scalar loop that is
10628         // rarely used is not worth unrolling.
10629         if (!LB.areSafetyChecksAdded())
10630           DisableRuntimeUnroll = true;
10631       }
10632       // Report the vectorization decision.
10633       ORE->emit([&]() {
10634         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10635                                   L->getHeader())
10636                << "vectorized loop (vectorization width: "
10637                << NV("VectorizationFactor", VF.Width)
10638                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10639       });
10640     }
10641 
10642     if (ORE->allowExtraAnalysis(LV_NAME))
10643       checkMixedPrecision(L, ORE);
10644   }
10645 
10646   Optional<MDNode *> RemainderLoopID =
10647       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10648                                       LLVMLoopVectorizeFollowupEpilogue});
10649   if (RemainderLoopID.hasValue()) {
10650     L->setLoopID(RemainderLoopID.getValue());
10651   } else {
10652     if (DisableRuntimeUnroll)
10653       AddRuntimeUnrollDisableMetaData(L);
10654 
10655     // Mark the loop as already vectorized to avoid vectorizing again.
10656     Hints.setAlreadyVectorized();
10657   }
10658 
10659   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10660   return true;
10661 }
10662 
10663 LoopVectorizeResult LoopVectorizePass::runImpl(
10664     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10665     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10666     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10667     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10668     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10669   SE = &SE_;
10670   LI = &LI_;
10671   TTI = &TTI_;
10672   DT = &DT_;
10673   BFI = &BFI_;
10674   TLI = TLI_;
10675   AA = &AA_;
10676   AC = &AC_;
10677   GetLAA = &GetLAA_;
10678   DB = &DB_;
10679   ORE = &ORE_;
10680   PSI = PSI_;
10681 
10682   // Don't attempt if
10683   // 1. the target claims to have no vector registers, and
10684   // 2. interleaving won't help ILP.
10685   //
10686   // The second condition is necessary because, even if the target has no
10687   // vector registers, loop vectorization may still enable scalar
10688   // interleaving.
10689   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10690       TTI->getMaxInterleaveFactor(1) < 2)
10691     return LoopVectorizeResult(false, false);
10692 
10693   bool Changed = false, CFGChanged = false;
10694 
10695   // The vectorizer requires loops to be in simplified form.
10696   // Since simplification may add new inner loops, it has to run before the
10697   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
10699   // vectorized.
10700   for (auto &L : *LI)
10701     Changed |= CFGChanged |=
10702         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10703 
10704   // Build up a worklist of inner-loops to vectorize. This is necessary as
10705   // the act of vectorizing or partially unrolling a loop creates new loops
10706   // and can invalidate iterators across the loops.
10707   SmallVector<Loop *, 8> Worklist;
10708 
10709   for (Loop *L : *LI)
10710     collectSupportedLoops(*L, LI, ORE, Worklist);
10711 
10712   LoopsAnalyzed += Worklist.size();
10713 
10714   // Now walk the identified inner loops.
10715   while (!Worklist.empty()) {
10716     Loop *L = Worklist.pop_back_val();
10717 
10718     // For the inner loops we actually process, form LCSSA to simplify the
10719     // transform.
10720     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10721 
10722     Changed |= CFGChanged |= processLoop(L);
10723   }
10724 
10725   // Process each loop nest in the function.
10726   return LoopVectorizeResult(Changed, CFGChanged);
10727 }
10728 
10729 PreservedAnalyses LoopVectorizePass::run(Function &F,
10730                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for the non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
10778 }
10779 
10780 void LoopVectorizePass::printPipeline(
10781     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10782   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10783       OS, MapClassName2PassName);
10784 
10785   OS << "<";
10786   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10787   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10788   OS << ">";
10789 }
10790