1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
18 // This pass has three parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/Metadata.h"
116 #include "llvm/IR/Module.h"
117 #include "llvm/IR/Operator.h"
118 #include "llvm/IR/PatternMatch.h"
119 #include "llvm/IR/Type.h"
120 #include "llvm/IR/Use.h"
121 #include "llvm/IR/User.h"
122 #include "llvm/IR/Value.h"
123 #include "llvm/IR/ValueHandle.h"
124 #include "llvm/IR/Verifier.h"
125 #include "llvm/InitializePasses.h"
126 #include "llvm/Pass.h"
127 #include "llvm/Support/Casting.h"
128 #include "llvm/Support/CommandLine.h"
129 #include "llvm/Support/Compiler.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <functional>
147 #include <iterator>
148 #include <limits>
149 #include <map>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
160 #ifndef NDEBUG
161 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
162 #endif
163 
164 /// @{
165 /// Metadata attribute names
166 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
167 const char LLVMLoopVectorizeFollowupVectorized[] =
168     "llvm.loop.vectorize.followup_vectorized";
169 const char LLVMLoopVectorizeFollowupEpilogue[] =
170     "llvm.loop.vectorize.followup_epilogue";
171 /// @}
172 
173 STATISTIC(LoopsVectorized, "Number of loops vectorized");
174 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176 
177 static cl::opt<bool> EnableEpilogueVectorization(
178     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
179     cl::desc("Enable vectorization of epilogue loops."));
180 
181 static cl::opt<unsigned> EpilogueVectorizationForceVF(
182     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
183     cl::desc("When epilogue vectorization is enabled, and a value greater than "
184              "1 is specified, forces the given VF for all applicable epilogue "
185              "loops."));
186 
187 static cl::opt<unsigned> EpilogueVectorizationMinVF(
188     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
189     cl::desc("Only loops with vectorization factor equal to or larger than "
190              "the specified value are considered for epilogue vectorization."));
191 
192 /// Loops with a known constant trip count below this number are vectorized only
193 /// if no scalar iteration overheads are incurred.
194 static cl::opt<unsigned> TinyTripCountVectorThreshold(
195     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
196     cl::desc("Loops with a constant trip count that is smaller than this "
197              "value are vectorized only if no scalar iteration overheads "
198              "are incurred."));
199 
200 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
201     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
202     cl::desc("The maximum allowed number of runtime memory checks with a "
203              "vectorize(enable) pragma."));
204 
205 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
206 // that predication is preferred, and this lists all options. I.e., the
207 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
208 // and predicate the instructions accordingly. If tail-folding fails, there are
209 // different fallback strategies depending on these values:
210 namespace PreferPredicateTy {
211   enum Option {
212     ScalarEpilogue = 0,
213     PredicateElseScalarEpilogue,
214     PredicateOrDontVectorize
215   };
216 } // namespace PreferPredicateTy
217 
218 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
219     "prefer-predicate-over-epilogue",
220     cl::init(PreferPredicateTy::ScalarEpilogue),
221     cl::Hidden,
222     cl::desc("Tail-folding and predication preferences over creating a scalar "
223              "epilogue loop."),
224     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
225                          "scalar-epilogue",
226                          "Don't tail-predicate loops, create scalar epilogue"),
227               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
228                          "predicate-else-scalar-epilogue",
229                          "prefer tail-folding, create scalar epilogue if tail "
230                          "folding fails."),
231               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
232                          "predicate-dont-vectorize",
233                          "prefers tail-folding, don't attempt vectorization if "
234                          "tail-folding fails.")));
235 
236 static cl::opt<bool> MaximizeBandwidth(
237     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
238     cl::desc("Maximize bandwidth when selecting vectorization factor which "
239              "will be determined by the smallest type in loop."));
240 
241 static cl::opt<bool> EnableInterleavedMemAccesses(
242     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
243     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
244 
245 /// An interleave-group may need masking if it resides in a block that needs
246 /// predication, or in order to mask away gaps.
247 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
248     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
249     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
250 
251 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
252     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
253     cl::desc("We don't interleave loops with a estimated constant trip count "
254              "below this number"));
255 
256 static cl::opt<unsigned> ForceTargetNumScalarRegs(
257     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
258     cl::desc("A flag that overrides the target's number of scalar registers."));
259 
260 static cl::opt<unsigned> ForceTargetNumVectorRegs(
261     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
262     cl::desc("A flag that overrides the target's number of vector registers."));
263 
264 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
265     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
266     cl::desc("A flag that overrides the target's max interleave factor for "
267              "scalar loops."));
268 
269 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
270     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
271     cl::desc("A flag that overrides the target's max interleave factor for "
272              "vectorized loops."));
273 
274 static cl::opt<unsigned> ForceTargetInstructionCost(
275     "force-target-instruction-cost", cl::init(0), cl::Hidden,
276     cl::desc("A flag that overrides the target's expected cost for "
277              "an instruction to a single constant value. Mostly "
278              "useful for getting consistent testing."));
279 
280 static cl::opt<bool> ForceTargetSupportsScalableVectors(
281     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
282     cl::desc(
283         "Pretend that scalable vectors are supported, even if the target does "
284         "not support them. This flag should only be used for testing."));
285 
286 static cl::opt<unsigned> SmallLoopCost(
287     "small-loop-cost", cl::init(20), cl::Hidden,
288     cl::desc(
289         "The cost of a loop that is considered 'small' by the interleaver."));
290 
291 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
292     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
293     cl::desc("Enable the use of the block frequency analysis to access PGO "
294              "heuristics minimizing code growth in cold regions and being more "
295              "aggressive in hot regions."));
296 
297 // Runtime interleave loops for load/store throughput.
298 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
299     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
300     cl::desc(
301         "Enable runtime interleaving until load/store ports are saturated"));
302 
303 /// Interleave small loops with scalar reductions.
304 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
305     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
306     cl::desc("Enable interleaving for loops with small iteration counts that "
307              "contain scalar reductions to expose ILP."));
308 
309 /// The number of stores in a loop that are allowed to need predication.
310 static cl::opt<unsigned> NumberOfStoresToPredicate(
311     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
312     cl::desc("Max number of stores to be predicated behind an if."));
313 
314 static cl::opt<bool> EnableIndVarRegisterHeur(
315     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
316     cl::desc("Count the induction variable only once when interleaving"));
317 
318 static cl::opt<bool> EnableCondStoresVectorization(
319     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
320     cl::desc("Enable if predication of stores during vectorization."));
321 
322 static cl::opt<unsigned> MaxNestedScalarReductionIC(
323     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
324     cl::desc("The maximum interleave count to use when interleaving a scalar "
325              "reduction in a nested loop."));
326 
327 static cl::opt<bool>
328     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
329                            cl::Hidden,
330                            cl::desc("Prefer in-loop vector reductions, "
331                                     "overriding the targets preference."));
332 
333 static cl::opt<bool> ForceOrderedReductions(
334     "force-ordered-reductions", cl::init(false), cl::Hidden,
335     cl::desc("Enable the vectorisation of loops with in-order (strict) "
336              "FP reductions"));
337 
338 static cl::opt<bool> PreferPredicatedReductionSelect(
339     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
340     cl::desc(
341         "Prefer predicating a reduction operation over an after loop select."));
342 
343 cl::opt<bool> EnableVPlanNativePath(
344     "enable-vplan-native-path", cl::init(false), cl::Hidden,
345     cl::desc("Enable VPlan-native vectorization path with "
346              "support for outer loop vectorization."));
347 
348 // FIXME: Remove this switch once we have divergence analysis. Currently we
349 // assume divergent non-backedge branches when this switch is true.
350 cl::opt<bool> EnableVPlanPredication(
351     "enable-vplan-predication", cl::init(false), cl::Hidden,
352     cl::desc("Enable VPlan-native vectorization path predicator with "
353              "support for outer loop vectorization."));
354 
355 // This flag enables the stress testing of the VPlan H-CFG construction in the
356 // VPlan-native vectorization path. It must be used in conjuction with
357 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
358 // verification of the H-CFGs built.
359 static cl::opt<bool> VPlanBuildStressTest(
360     "vplan-build-stress-test", cl::init(false), cl::Hidden,
361     cl::desc(
362         "Build VPlan for every supported loop nest in the function and bail "
363         "out right after the build (stress test the VPlan H-CFG construction "
364         "in the VPlan-native vectorization path)."));
365 
366 cl::opt<bool> llvm::EnableLoopInterleaving(
367     "interleave-loops", cl::init(true), cl::Hidden,
368     cl::desc("Enable loop interleaving in Loop vectorization passes"));
369 cl::opt<bool> llvm::EnableLoopVectorization(
370     "vectorize-loops", cl::init(true), cl::Hidden,
371     cl::desc("Run the Loop vectorization passes"));
372 
373 cl::opt<bool> PrintVPlansInDotFormat(
374     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
375     cl::desc("Use dot format instead of plain text when dumping VPlans"));
376 
377 /// A helper function that returns true if the given type is irregular. The
378 /// type is irregular if its allocated size doesn't equal the store size of an
379 /// element of the corresponding vector type.
380 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
381   // Determine if an array of N elements of type Ty is "bitcast compatible"
382   // with a <N x Ty> vector.
383   // This is only true if there is no padding between the array elements.
384   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
385 }
386 
387 /// A helper function that returns the reciprocal of the block probability of
388 /// predicated blocks. If we return X, we are assuming the predicated block
389 /// will execute once for every X iterations of the loop header.
390 ///
391 /// TODO: We should use actual block probability here, if available. Currently,
392 ///       we always assume predicated blocks have a 50% chance of executing.
393 static unsigned getReciprocalPredBlockProb() { return 2; }
394 
395 /// A helper function that returns an integer or floating-point constant with
396 /// value C.
397 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
398   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
399                            : ConstantFP::get(Ty, C);
400 }
401 
402 /// Returns "best known" trip count for the specified loop \p L as defined by
403 /// the following procedure:
404 ///   1) Returns exact trip count if it is known.
405 ///   2) Returns expected trip count according to profile data if any.
406 ///   3) Returns upper bound estimate if it is known.
407 ///   4) Returns None if all of the above failed.
408 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
409   // Check if exact trip count is known.
410   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
411     return ExpectedTC;
412 
413   // Check if there is an expected trip count available from profile data.
414   if (LoopVectorizeWithBlockFrequency)
415     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
416       return EstimatedTC;
417 
418   // Check if upper bound estimate is known.
419   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
420     return ExpectedTC;
421 
422   return None;
423 }
424 
425 // Forward declare GeneratedRTChecks.
426 class GeneratedRTChecks;
427 
428 namespace llvm {
429 
430 AnalysisKey ShouldRunExtraVectorPasses::Key;
431 
432 /// InnerLoopVectorizer vectorizes loops which contain only one basic
433 /// block to a specified vectorization factor (VF).
434 /// This class performs the widening of scalars into vectors, or multiple
435 /// scalars. This class also implements the following features:
436 /// * It inserts an epilogue loop for handling loops that don't have iteration
437 ///   counts that are known to be a multiple of the vectorization factor.
438 /// * It handles the code generation for reduction variables.
439 /// * Scalarization (implementation using scalars) of un-vectorizable
440 ///   instructions.
441 /// InnerLoopVectorizer does not perform any vectorization-legality
442 /// checks, and relies on the caller to check for the different legality
443 /// aspects. The InnerLoopVectorizer relies on the
444 /// LoopVectorizationLegality class to provide information about the induction
445 /// and reduction variables that were found to a given vectorization factor.
446 class InnerLoopVectorizer {
447 public:
448   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
449                       LoopInfo *LI, DominatorTree *DT,
450                       const TargetLibraryInfo *TLI,
451                       const TargetTransformInfo *TTI, AssumptionCache *AC,
452                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
453                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
454                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
455                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
456       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
457         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
458         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
459         PSI(PSI), RTChecks(RTChecks) {
460     // Query this against the original loop and save it here because the profile
461     // of the original loop header may change as the transformation happens.
462     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
463         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
464   }
465 
466   virtual ~InnerLoopVectorizer() = default;
467 
468   /// Create a new empty loop that will contain vectorized instructions later
469   /// on, while the old loop will be used as the scalar remainder. Control flow
470   /// is generated around the vectorized (and scalar epilogue) loops consisting
471   /// of various checks and bypasses. Return the pre-header block of the new
472   /// loop and the start value for the canonical induction, if it is != 0. The
473   /// latter is the case when vectorizing the epilogue loop. In the case of
474   /// epilogue vectorization, this function is overriden to handle the more
475   /// complex control flow around the loops.
476   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
477 
478   /// Widen a single call instruction within the innermost loop.
479   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
480                             VPTransformState &State);
481 
482   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
483   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
484 
485   // Return true if any runtime check is added.
486   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
487 
488   /// A type for vectorized values in the new loop. Each value from the
489   /// original loop, when vectorized, is represented by UF vector values in the
490   /// new unrolled loop, where UF is the unroll factor.
491   using VectorParts = SmallVector<Value *, 2>;
492 
493   /// Vectorize a single vector PHINode in a block in the VPlan-native path
494   /// only.
495   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
496                            VPTransformState &State);
497 
498   /// A helper function to scalarize a single Instruction in the innermost loop.
499   /// Generates a sequence of scalar instances for each lane between \p MinLane
500   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
501   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
502   /// Instr's operands.
503   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
504                             const VPIteration &Instance, bool IfPredicateInstr,
505                             VPTransformState &State);
506 
507   /// Construct the vector value of a scalarized value \p V one lane at a time.
508   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
509                                  VPTransformState &State);
510 
511   /// Try to vectorize interleaved access group \p Group with the base address
512   /// given in \p Addr, optionally masking the vector operations if \p
513   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
514   /// values in the vectorized loop.
515   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
516                                 ArrayRef<VPValue *> VPDefs,
517                                 VPTransformState &State, VPValue *Addr,
518                                 ArrayRef<VPValue *> StoredValues,
519                                 VPValue *BlockInMask = nullptr);
520 
521   /// Set the debug location in the builder \p Ptr using the debug location in
522   /// \p V. If \p Ptr is None then it uses the class member's Builder.
523   void setDebugLocFromInst(const Value *V,
524                            Optional<IRBuilderBase *> CustomBuilder = None);
525 
526   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
527   void fixNonInductionPHIs(VPTransformState &State);
528 
529   /// Returns true if the reordering of FP operations is not allowed, but we are
530   /// able to vectorize with strict in-order reductions for the given RdxDesc.
531   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
532 
533   /// Create a broadcast instruction. This method generates a broadcast
534   /// instruction (shuffle) for loop invariant values and for the induction
535   /// value. If this is the induction variable then we extend it to N, N+1, ...
536   /// this is needed because each iteration in the loop corresponds to a SIMD
537   /// element.
538   virtual Value *getBroadcastInstrs(Value *V);
539 
540   /// Add metadata from one instruction to another.
541   ///
542   /// This includes both the original MDs from \p From and additional ones (\see
543   /// addNewMetadata).  Use this for *newly created* instructions in the vector
544   /// loop.
545   void addMetadata(Instruction *To, Instruction *From);
546 
547   /// Similar to the previous function but it adds the metadata to a
548   /// vector of instructions.
549   void addMetadata(ArrayRef<Value *> To, Instruction *From);
550 
551   // Returns the resume value (bc.merge.rdx) for a reduction as
552   // generated by fixReduction.
553   PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
554 
555 protected:
556   friend class LoopVectorizationPlanner;
557 
558   /// A small list of PHINodes.
559   using PhiVector = SmallVector<PHINode *, 4>;
560 
561   /// A type for scalarized values in the new loop. Each value from the
562   /// original loop, when scalarized, is represented by UF x VF scalar values
563   /// in the new unrolled loop, where UF is the unroll factor and VF is the
564   /// vectorization factor.
565   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
566 
567   /// Set up the values of the IVs correctly when exiting the vector loop.
568   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
569                     Value *VectorTripCount, Value *EndValue,
570                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader);
571 
572   /// Handle all cross-iteration phis in the header.
573   void fixCrossIterationPHIs(VPTransformState &State);
574 
575   /// Create the exit value of first order recurrences in the middle block and
576   /// update their users.
577   void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
578                                VPTransformState &State);
579 
580   /// Create code for the loop exit value of the reduction.
581   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
582 
583   /// Clear NSW/NUW flags from reduction instructions if necessary.
584   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
585                                VPTransformState &State);
586 
587   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
588   /// means we need to add the appropriate incoming value from the middle
589   /// block as exiting edges from the scalar epilogue loop (if present) are
590   /// already in place, and we exit the vector loop exclusively to the middle
591   /// block.
592   void fixLCSSAPHIs(VPTransformState &State);
593 
594   /// Iteratively sink the scalarized operands of a predicated instruction into
595   /// the block that was created for it.
596   void sinkScalarOperands(Instruction *PredInst);
597 
598   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
599   /// represented as.
600   void truncateToMinimalBitwidths(VPTransformState &State);
601 
602   /// Returns (and creates if needed) the original loop trip count.
603   Value *getOrCreateTripCount(BasicBlock *InsertBlock);
604 
605   /// Returns (and creates if needed) the trip count of the widened loop.
606   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
607 
608   /// Returns a bitcasted value to the requested vector type.
609   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
610   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
611                                 const DataLayout &DL);
612 
613   /// Emit a bypass check to see if the vector trip count is zero, including if
614   /// it overflows.
615   void emitIterationCountCheck(BasicBlock *Bypass);
616 
617   /// Emit a bypass check to see if all of the SCEV assumptions we've
618   /// had to make are correct. Returns the block containing the checks or
619   /// nullptr if no checks have been added.
620   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
621 
622   /// Emit bypass checks to check any memory assumptions we may have made.
623   /// Returns the block containing the checks or nullptr if no checks have been
624   /// added.
625   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
626 
627   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
628   /// vector loop preheader, middle block and scalar preheader.
629   void createVectorLoopSkeleton(StringRef Prefix);
630 
631   /// Create new phi nodes for the induction variables to resume iteration count
632   /// in the scalar epilogue, from where the vectorized loop left off.
633   /// In cases where the loop skeleton is more complicated (eg. epilogue
634   /// vectorization) and the resume values can come from an additional bypass
635   /// block, the \p AdditionalBypass pair provides information about the bypass
636   /// block and the end value on the edge from bypass to this loop.
637   void createInductionResumeValues(
638       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
639 
640   /// Complete the loop skeleton by adding debug MDs, creating appropriate
641   /// conditional branches in the middle block, preparing the builder and
642   /// running the verifier. Return the preheader of the completed vector loop.
643   BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);
644 
645   /// Add additional metadata to \p To that was not present on \p Orig.
646   ///
647   /// Currently this is used to add the noalias annotations based on the
648   /// inserted memchecks.  Use this for instructions that are *cloned* into the
649   /// vector loop.
650   void addNewMetadata(Instruction *To, const Instruction *Orig);
651 
652   /// Collect poison-generating recipes that may generate a poison value that is
653   /// used after vectorization, even when their operands are not poison. Those
654   /// recipes meet the following conditions:
655   ///  * Contribute to the address computation of a recipe generating a widen
656   ///    memory load/store (VPWidenMemoryInstructionRecipe or
657   ///    VPInterleaveRecipe).
658   ///  * Such a widen memory load/store has at least one underlying Instruction
659   ///    that is in a basic block that needs predication and after vectorization
660   ///    the generated instruction won't be predicated.
661   void collectPoisonGeneratingRecipes(VPTransformState &State);
662 
663   /// Allow subclasses to override and print debug traces before/after vplan
664   /// execution, when trace information is requested.
665   virtual void printDebugTracesAtStart(){};
666   virtual void printDebugTracesAtEnd(){};
667 
668   /// The original loop.
669   Loop *OrigLoop;
670 
671   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
672   /// dynamic knowledge to simplify SCEV expressions and converts them to a
673   /// more usable form.
674   PredicatedScalarEvolution &PSE;
675 
676   /// Loop Info.
677   LoopInfo *LI;
678 
679   /// Dominator Tree.
680   DominatorTree *DT;
681 
682   /// Alias Analysis.
683   AAResults *AA;
684 
685   /// Target Library Info.
686   const TargetLibraryInfo *TLI;
687 
688   /// Target Transform Info.
689   const TargetTransformInfo *TTI;
690 
691   /// Assumption Cache.
692   AssumptionCache *AC;
693 
694   /// Interface to emit optimization remarks.
695   OptimizationRemarkEmitter *ORE;
696 
697   /// LoopVersioning.  It's only set up (non-null) if memchecks were
698   /// used.
699   ///
700   /// This is currently only used to add no-alias metadata based on the
701   /// memchecks.  The actually versioning is performed manually.
702   std::unique_ptr<LoopVersioning> LVer;
703 
704   /// The vectorization SIMD factor to use. Each vector will have this many
705   /// vector elements.
706   ElementCount VF;
707 
708   /// The vectorization unroll factor to use. Each scalar is vectorized to this
709   /// many different vector instructions.
710   unsigned UF;
711 
712   /// The builder that we use
713   IRBuilder<> Builder;
714 
715   // --- Vectorization state ---
716 
717   /// The vector-loop preheader.
718   BasicBlock *LoopVectorPreHeader;
719 
720   /// The scalar-loop preheader.
721   BasicBlock *LoopScalarPreHeader;
722 
723   /// Middle Block between the vector and the scalar.
724   BasicBlock *LoopMiddleBlock;
725 
726   /// The unique ExitBlock of the scalar loop if one exists.  Note that
727   /// there can be multiple exiting edges reaching this block.
728   BasicBlock *LoopExitBlock;
729 
730   /// The scalar loop body.
731   BasicBlock *LoopScalarBody;
732 
733   /// A list of all bypass blocks. The first block is the entry of the loop.
734   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
735 
736   /// Store instructions that were predicated.
737   SmallVector<Instruction *, 4> PredicatedInstructions;
738 
739   /// Trip count of the original loop.
740   Value *TripCount = nullptr;
741 
742   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
743   Value *VectorTripCount = nullptr;
744 
745   /// The legality analysis.
746   LoopVectorizationLegality *Legal;
747 
748   /// The profitablity analysis.
749   LoopVectorizationCostModel *Cost;
750 
751   // Record whether runtime checks are added.
752   bool AddedSafetyChecks = false;
753 
754   // Holds the end values for each induction variable. We save the end values
755   // so we can later fix-up the external users of the induction variables.
756   DenseMap<PHINode *, Value *> IVEndValues;
757 
758   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
759   // fixed up at the end of vector code generation.
760   SmallVector<PHINode *, 8> OrigPHIsToFix;
761 
762   /// BFI and PSI are used to check for profile guided size optimizations.
763   BlockFrequencyInfo *BFI;
764   ProfileSummaryInfo *PSI;
765 
766   // Whether this loop should be optimized for size based on profile guided size
767   // optimizatios.
768   bool OptForSizeBasedOnProfile;
769 
770   /// Structure to hold information about generated runtime checks, responsible
771   /// for cleaning the checks, if vectorization turns out unprofitable.
772   GeneratedRTChecks &RTChecks;
773 
774   // Holds the resume values for reductions in the loops, used to set the
775   // correct start value of reduction PHIs when vectorizing the epilogue.
776   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
777       ReductionResumeValues;
778 };
779 
780 class InnerLoopUnroller : public InnerLoopVectorizer {
781 public:
782   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
783                     LoopInfo *LI, DominatorTree *DT,
784                     const TargetLibraryInfo *TLI,
785                     const TargetTransformInfo *TTI, AssumptionCache *AC,
786                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
787                     LoopVectorizationLegality *LVL,
788                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
789                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
790       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
791                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
792                             BFI, PSI, Check) {}
793 
794 private:
795   Value *getBroadcastInstrs(Value *V) override;
796 };
797 
798 /// Encapsulate information regarding vectorization of a loop and its epilogue.
799 /// This information is meant to be updated and used across two stages of
800 /// epilogue vectorization.
801 struct EpilogueLoopVectorizationInfo {
802   ElementCount MainLoopVF = ElementCount::getFixed(0);
803   unsigned MainLoopUF = 0;
804   ElementCount EpilogueVF = ElementCount::getFixed(0);
805   unsigned EpilogueUF = 0;
806   BasicBlock *MainLoopIterationCountCheck = nullptr;
807   BasicBlock *EpilogueIterationCountCheck = nullptr;
808   BasicBlock *SCEVSafetyCheck = nullptr;
809   BasicBlock *MemSafetyCheck = nullptr;
810   Value *TripCount = nullptr;
811   Value *VectorTripCount = nullptr;
812 
813   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
814                                 ElementCount EVF, unsigned EUF)
815       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
816     assert(EUF == 1 &&
817            "A high UF for the epilogue loop is likely not beneficial.");
818   }
819 };
820 
821 /// An extension of the inner loop vectorizer that creates a skeleton for a
822 /// vectorized loop that has its epilogue (residual) also vectorized.
823 /// The idea is to run the vplan on a given loop twice, firstly to setup the
824 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
825 /// from the first step and vectorize the epilogue.  This is achieved by
826 /// deriving two concrete strategy classes from this base class and invoking
827 /// them in succession from the loop vectorizer planner.
828 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
829 public:
830   InnerLoopAndEpilogueVectorizer(
831       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
832       DominatorTree *DT, const TargetLibraryInfo *TLI,
833       const TargetTransformInfo *TTI, AssumptionCache *AC,
834       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
835       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
836       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
837       GeneratedRTChecks &Checks)
838       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
839                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
840                             Checks),
841         EPI(EPI) {}
842 
843   // Override this function to handle the more complex control flow around the
844   // three loops.
845   std::pair<BasicBlock *, Value *>
846   createVectorizedLoopSkeleton() final override {
847     return createEpilogueVectorizedLoopSkeleton();
848   }
849 
850   /// The interface for creating a vectorized skeleton using one of two
851   /// different strategies, each corresponding to one execution of the vplan
852   /// as described above.
853   virtual std::pair<BasicBlock *, Value *>
854   createEpilogueVectorizedLoopSkeleton() = 0;
855 
856   /// Holds and updates state information required to vectorize the main loop
857   /// and its epilogue in two separate passes. This setup helps us avoid
858   /// regenerating and recomputing runtime safety checks. It also helps us to
859   /// shorten the iteration-count-check path length for the cases where the
860   /// iteration count of the loop is so small that the main vector loop is
861   /// completely skipped.
862   EpilogueLoopVectorizationInfo &EPI;
863 };
864 
865 /// A specialized derived class of inner loop vectorizer that performs
866 /// vectorization of *main* loops in the process of vectorizing loops and their
867 /// epilogues.
868 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
869 public:
870   EpilogueVectorizerMainLoop(
871       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
872       DominatorTree *DT, const TargetLibraryInfo *TLI,
873       const TargetTransformInfo *TTI, AssumptionCache *AC,
874       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
875       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
876       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
877       GeneratedRTChecks &Check)
878       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
879                                        EPI, LVL, CM, BFI, PSI, Check) {}
880   /// Implements the interface for creating a vectorized skeleton using the
881   /// *main loop* strategy (ie the first pass of vplan execution).
882   std::pair<BasicBlock *, Value *>
883   createEpilogueVectorizedLoopSkeleton() final override;
884 
885 protected:
886   /// Emits an iteration count bypass check once for the main loop (when \p
887   /// ForEpilogue is false) and once for the epilogue loop (when \p
888   /// ForEpilogue is true).
889   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
890   void printDebugTracesAtStart() override;
891   void printDebugTracesAtEnd() override;
892 };
893 
894 // A specialized derived class of inner loop vectorizer that performs
895 // vectorization of *epilogue* loops in the process of vectorizing loops and
896 // their epilogues.
897 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
898 public:
899   EpilogueVectorizerEpilogueLoop(
900       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
901       DominatorTree *DT, const TargetLibraryInfo *TLI,
902       const TargetTransformInfo *TTI, AssumptionCache *AC,
903       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
904       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
905       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
906       GeneratedRTChecks &Checks)
907       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
908                                        EPI, LVL, CM, BFI, PSI, Checks) {
909     TripCount = EPI.TripCount;
910   }
911   /// Implements the interface for creating a vectorized skeleton using the
912   /// *epilogue loop* strategy (ie the second pass of vplan execution).
913   std::pair<BasicBlock *, Value *>
914   createEpilogueVectorizedLoopSkeleton() final override;
915 
916 protected:
917   /// Emits an iteration count bypass check after the main vector loop has
918   /// finished to see if there are any iterations left to execute by either
919   /// the vector epilogue or the scalar epilogue.
920   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
921                                                       BasicBlock *Bypass,
922                                                       BasicBlock *Insert);
923   void printDebugTracesAtStart() override;
924   void printDebugTracesAtEnd() override;
925 };
926 } // end namespace llvm
927 
928 /// Look for a meaningful debug location on the instruction or it's
929 /// operands.
930 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
931   if (!I)
932     return I;
933 
934   DebugLoc Empty;
935   if (I->getDebugLoc() != Empty)
936     return I;
937 
938   for (Use &Op : I->operands()) {
939     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
940       if (OpInst->getDebugLoc() != Empty)
941         return OpInst;
942   }
943 
944   return I;
945 }
946 
947 void InnerLoopVectorizer::setDebugLocFromInst(
948     const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
949   IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
950   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
951     const DILocation *DIL = Inst->getDebugLoc();
952 
953     // When a FSDiscriminator is enabled, we don't need to add the multiply
954     // factors to the discriminators.
955     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
956         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
957       // FIXME: For scalable vectors, assume vscale=1.
958       auto NewDIL =
959           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
960       if (NewDIL)
961         B->SetCurrentDebugLocation(NewDIL.getValue());
962       else
963         LLVM_DEBUG(dbgs()
964                    << "Failed to create new discriminator: "
965                    << DIL->getFilename() << " Line: " << DIL->getLine());
966     } else
967       B->SetCurrentDebugLocation(DIL);
968   } else
969     B->SetCurrentDebugLocation(DebugLoc());
970 }
971 
972 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
973 /// is passed, the message relates to that particular instruction.
974 #ifndef NDEBUG
975 static void debugVectorizationMessage(const StringRef Prefix,
976                                       const StringRef DebugMsg,
977                                       Instruction *I) {
978   dbgs() << "LV: " << Prefix << DebugMsg;
979   if (I != nullptr)
980     dbgs() << " " << *I;
981   else
982     dbgs() << '.';
983   dbgs() << '\n';
984 }
985 #endif
986 
987 /// Create an analysis remark that explains why vectorization failed
988 ///
989 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
990 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
991 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
992 /// the location of the remark.  \return the remark object that can be
993 /// streamed to.
994 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
995     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
996   Value *CodeRegion = TheLoop->getHeader();
997   DebugLoc DL = TheLoop->getStartLoc();
998 
999   if (I) {
1000     CodeRegion = I->getParent();
1001     // If there is no debug location attached to the instruction, revert back to
1002     // using the loop's.
1003     if (I->getDebugLoc())
1004       DL = I->getDebugLoc();
1005   }
1006 
1007   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1008 }
1009 
1010 namespace llvm {
1011 
1012 /// Return a value for Step multiplied by VF.
1013 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
1014                        int64_t Step) {
1015   assert(Ty->isIntegerTy() && "Expected an integer step");
1016   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
1017   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1018 }
1019 
1020 /// Return the runtime value for VF.
1021 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
1022   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1023   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1024 }
1025 
1026 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
1027                                   ElementCount VF) {
1028   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1029   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1030   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1031   return B.CreateUIToFP(RuntimeVF, FTy);
1032 }
1033 
1034 void reportVectorizationFailure(const StringRef DebugMsg,
1035                                 const StringRef OREMsg, const StringRef ORETag,
1036                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1037                                 Instruction *I) {
1038   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1039   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1040   ORE->emit(
1041       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1042       << "loop not vectorized: " << OREMsg);
1043 }
1044 
1045 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1046                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1047                              Instruction *I) {
1048   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1049   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1050   ORE->emit(
1051       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1052       << Msg);
1053 }
1054 
1055 } // end namespace llvm
1056 
1057 #ifndef NDEBUG
1058 /// \return string containing a file name and a line # for the given loop.
1059 static std::string getDebugLocString(const Loop *L) {
1060   std::string Result;
1061   if (L) {
1062     raw_string_ostream OS(Result);
1063     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1064       LoopDbgLoc.print(OS);
1065     else
1066       // Just print the module name.
1067       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1068     OS.flush();
1069   }
1070   return Result;
1071 }
1072 #endif
1073 
1074 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1075                                          const Instruction *Orig) {
1076   // If the loop was versioned with memchecks, add the corresponding no-alias
1077   // metadata.
1078   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1079     LVer->annotateInstWithNoAlias(To, Orig);
1080 }
1081 
1082 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1083     VPTransformState &State) {
1084 
1085   // Collect recipes in the backward slice of `Root` that may generate a poison
1086   // value that is used after vectorization.
1087   SmallPtrSet<VPRecipeBase *, 16> Visited;
1088   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1089     SmallVector<VPRecipeBase *, 16> Worklist;
1090     Worklist.push_back(Root);
1091 
1092     // Traverse the backward slice of Root through its use-def chain.
1093     while (!Worklist.empty()) {
1094       VPRecipeBase *CurRec = Worklist.back();
1095       Worklist.pop_back();
1096 
1097       if (!Visited.insert(CurRec).second)
1098         continue;
1099 
1100       // Prune search if we find another recipe generating a widen memory
1101       // instruction. Widen memory instructions involved in address computation
1102       // will lead to gather/scatter instructions, which don't need to be
1103       // handled.
1104       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1105           isa<VPInterleaveRecipe>(CurRec) ||
1106           isa<VPScalarIVStepsRecipe>(CurRec) ||
1107           isa<VPCanonicalIVPHIRecipe>(CurRec))
1108         continue;
1109 
1110       // This recipe contributes to the address computation of a widen
1111       // load/store. Collect recipe if its underlying instruction has
1112       // poison-generating flags.
1113       Instruction *Instr = CurRec->getUnderlyingInstr();
1114       if (Instr && Instr->hasPoisonGeneratingFlags())
1115         State.MayGeneratePoisonRecipes.insert(CurRec);
1116 
1117       // Add new definitions to the worklist.
1118       for (VPValue *operand : CurRec->operands())
1119         if (VPDef *OpDef = operand->getDef())
1120           Worklist.push_back(cast<VPRecipeBase>(OpDef));
1121     }
1122   });
1123 
1124   // Traverse all the recipes in the VPlan and collect the poison-generating
1125   // recipes in the backward slice starting at the address of a VPWidenRecipe or
1126   // VPInterleaveRecipe.
1127   auto Iter = depth_first(
1128       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1129   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1130     for (VPRecipeBase &Recipe : *VPBB) {
1131       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1132         Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
1133         VPDef *AddrDef = WidenRec->getAddr()->getDef();
1134         if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
1135             Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
1136           collectPoisonGeneratingInstrsInBackwardSlice(
1137               cast<VPRecipeBase>(AddrDef));
1138       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1139         VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1140         if (AddrDef) {
1141           // Check if any member of the interleave group needs predication.
1142           const InterleaveGroup<Instruction> *InterGroup =
1143               InterleaveRec->getInterleaveGroup();
1144           bool NeedPredication = false;
1145           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1146                I < NumMembers; ++I) {
1147             Instruction *Member = InterGroup->getMember(I);
1148             if (Member)
1149               NeedPredication |=
1150                   Legal->blockNeedsPredication(Member->getParent());
1151           }
1152 
1153           if (NeedPredication)
1154             collectPoisonGeneratingInstrsInBackwardSlice(
1155                 cast<VPRecipeBase>(AddrDef));
1156         }
1157       }
1158     }
1159   }
1160 }
1161 
1162 void InnerLoopVectorizer::addMetadata(Instruction *To,
1163                                       Instruction *From) {
1164   propagateMetadata(To, From);
1165   addNewMetadata(To, From);
1166 }
1167 
1168 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1169                                       Instruction *From) {
1170   for (Value *V : To) {
1171     if (Instruction *I = dyn_cast<Instruction>(V))
1172       addMetadata(I, From);
1173   }
1174 }
1175 
1176 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1177     const RecurrenceDescriptor &RdxDesc) {
1178   auto It = ReductionResumeValues.find(&RdxDesc);
1179   assert(It != ReductionResumeValues.end() &&
1180          "Expected to find a resume value for the reduction.");
1181   return It->second;
1182 }
1183 
1184 namespace llvm {
1185 
1186 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1187 // lowered.
1188 enum ScalarEpilogueLowering {
1189 
1190   // The default: allowing scalar epilogues.
1191   CM_ScalarEpilogueAllowed,
1192 
1193   // Vectorization with OptForSize: don't allow epilogues.
1194   CM_ScalarEpilogueNotAllowedOptSize,
1195 
1196   // A special case of vectorisation with OptForSize: loops with a very small
1197   // trip count are considered for vectorization under OptForSize, thereby
1198   // making sure the cost of their loop body is dominant, free of runtime
1199   // guards and scalar iteration overheads.
1200   CM_ScalarEpilogueNotAllowedLowTripLoop,
1201 
1202   // Loop hint predicate indicating an epilogue is undesired.
1203   CM_ScalarEpilogueNotNeededUsePredicate,
1204 
1205   // Directive indicating we must either tail fold or not vectorize
1206   CM_ScalarEpilogueNotAllowedUsePredicate
1207 };
1208 
1209 /// ElementCountComparator creates a total ordering for ElementCount
1210 /// for the purposes of using it in a set structure.
1211 struct ElementCountComparator {
1212   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1213     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1214            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1215   }
1216 };
1217 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
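
// Example of the total order defined by ElementCountComparator (illustrative
// only): fixed VFs compare below scalable VFs, and ties are broken by the
// known minimum element count, e.g.
//
//   ElementCountComparator Cmp;
//   Cmp(ElementCount::getFixed(2), ElementCount::getFixed(8));    // true
//   Cmp(ElementCount::getFixed(8), ElementCount::getScalable(2)); // true
//   Cmp(ElementCount::getScalable(2), ElementCount::getFixed(8)); // false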
1218 
1219 /// LoopVectorizationCostModel - estimates the expected speedups due to
1220 /// vectorization.
1221 /// In many cases vectorization is not profitable. This can happen because of
1222 /// a number of reasons. In this class we mainly attempt to predict the
1223 /// expected speedup/slowdowns due to the supported instruction set. We use the
1224 /// TargetTransformInfo to query the different backends for the cost of
1225 /// different operations.
1226 class LoopVectorizationCostModel {
1227 public:
1228   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1229                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1230                              LoopVectorizationLegality *Legal,
1231                              const TargetTransformInfo &TTI,
1232                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1233                              AssumptionCache *AC,
1234                              OptimizationRemarkEmitter *ORE, const Function *F,
1235                              const LoopVectorizeHints *Hints,
1236                              InterleavedAccessInfo &IAI)
1237       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1238         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1239         Hints(Hints), InterleaveInfo(IAI) {}
1240 
1241   /// \return An upper bound for the vectorization factors (both fixed and
1242   /// scalable). If the factors are 0, vectorization and interleaving should be
1243   /// avoided up front.
1244   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1245 
1246   /// \return True if runtime checks are required for vectorization, and false
1247   /// otherwise.
1248   bool runtimeChecksRequired();
1249 
  /// \return The most profitable vectorization factor and the cost of that
  /// VF. This method checks every VF in \p CandidateVFs. If UserVF is not
  /// zero, then that vectorization factor will be selected, provided
  /// vectorization is possible.
1254   VectorizationFactor
1255   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1256 
1257   VectorizationFactor
1258   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1259                                     const LoopVectorizationPlanner &LVP);
1260 
1261   /// Setup cost-based decisions for user vectorization factor.
1262   /// \return true if the UserVF is a feasible VF to be chosen.
1263   bool selectUserVectorizationFactor(ElementCount UserVF) {
1264     collectUniformsAndScalars(UserVF);
1265     collectInstsToScalarize(UserVF);
1266     return expectedCost(UserVF).first.isValid();
1267   }
1268 
1269   /// \return The size (in bits) of the smallest and widest types in the code
1270   /// that needs to be vectorized. We ignore values that remain scalar such as
1271   /// 64 bit loop indices.
1272   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1273 
1274   /// \return The desired interleave count.
1275   /// If interleave count has been specified by metadata it will be returned.
1276   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1277   /// are the selected vectorization factor and the cost of the selected VF.
1278   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1279 
  /// A memory access instruction may be vectorized in more than one way; the
  /// form it takes after vectorization depends on cost. This function makes
  /// cost-based decisions for Load/Store instructions and collects them in a
  /// map. This decision map is used for building the lists of loop-uniform
  /// and loop-scalar instructions. The calculated cost is saved with the
  /// widening decision in order to avoid redundant calculations.
1287   void setCostBasedWideningDecision(ElementCount VF);
1288 
1289   /// A struct that represents some properties of the register usage
1290   /// of a loop.
1291   struct RegisterUsage {
1292     /// Holds the number of loop invariant values that are used in the loop.
1293     /// The key is ClassID of target-provided register class.
1294     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1295     /// Holds the maximum number of concurrent live intervals in the loop.
1296     /// The key is ClassID of target-provided register class.
1297     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1298   };
1299 
1300   /// \return Returns information about the register usages of the loop for the
1301   /// given vectorization factors.
1302   SmallVector<RegisterUsage, 8>
1303   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1304 
1305   /// Collect values we want to ignore in the cost model.
1306   void collectValuesToIgnore();
1307 
1308   /// Collect all element types in the loop for which widening is needed.
1309   void collectElementTypesForWidening();
1310 
  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
1313   void collectInLoopReductions();
1314 
1315   /// Returns true if we should use strict in-order reductions for the given
1316   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1317   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1318   /// of FP operations.
1319   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1320     return !Hints->allowReordering() && RdxDesc.isOrdered();
1321   }
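
  // Illustrative example (not part of the pass): an ordered FP reduction like
  //
  //   float S = 0.0f;
  //   for (int i = 0; i < N; ++i)
  //     S += A[i];   // no fast-math reassociation allowed
  //
  // must preserve the sequential association ((S + A[0]) + A[1]) + ... and is
  // therefore vectorized with a strict, in-order reduction inside the loop
  // rather than a tree-wise reduction of partial sums after the loop.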
1322 
1323   /// \returns The smallest bitwidth each instruction can be represented with.
1324   /// The vector equivalents of these instructions should be truncated to this
1325   /// type.
1326   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1327     return MinBWs;
1328   }
1329 
1330   /// \returns True if it is more profitable to scalarize instruction \p I for
1331   /// vectorization factor \p VF.
1332   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1333     assert(VF.isVector() &&
1334            "Profitable to scalarize relevant only for VF > 1.");
1335 
1336     // Cost model is not run in the VPlan-native path - return conservative
1337     // result until this changes.
1338     if (EnableVPlanNativePath)
1339       return false;
1340 
1341     auto Scalars = InstsToScalarize.find(VF);
1342     assert(Scalars != InstsToScalarize.end() &&
1343            "VF not yet analyzed for scalarization profitability");
1344     return Scalars->second.find(I) != Scalars->second.end();
1345   }
1346 
1347   /// Returns true if \p I is known to be uniform after vectorization.
1348   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1349     if (VF.isScalar())
1350       return true;
1351 
1352     // Cost model is not run in the VPlan-native path - return conservative
1353     // result until this changes.
1354     if (EnableVPlanNativePath)
1355       return false;
1356 
1357     auto UniformsPerVF = Uniforms.find(VF);
1358     assert(UniformsPerVF != Uniforms.end() &&
1359            "VF not yet analyzed for uniformity");
1360     return UniformsPerVF->second.count(I);
1361   }
1362 
1363   /// Returns true if \p I is known to be scalar after vectorization.
1364   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1365     if (VF.isScalar())
1366       return true;
1367 
1368     // Cost model is not run in the VPlan-native path - return conservative
1369     // result until this changes.
1370     if (EnableVPlanNativePath)
1371       return false;
1372 
1373     auto ScalarsPerVF = Scalars.find(VF);
1374     assert(ScalarsPerVF != Scalars.end() &&
1375            "Scalar values are not calculated for VF");
1376     return ScalarsPerVF->second.count(I);
1377   }
1378 
1379   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1380   /// for vectorization factor \p VF.
1381   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1382     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1383            !isProfitableToScalarize(I, VF) &&
1384            !isScalarAfterVectorization(I, VF);
1385   }
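
  // Illustrative example (not part of the pass): if the loop only ever uses
  // the low 8 bits of a 32-bit computation, MinBWs records 8 for it and the
  // vectorized code can operate on <VF x i8> instead of <VF x i32> lanes:
  //
  //   for (int i = 0; i < N; ++i)
  //     Out[i] = (unsigned char)(In[i] + 3);   // only the low 8 bits are used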
1386 
1387   /// Decision that was taken during cost calculation for memory instruction.
1388   enum InstWidening {
1389     CM_Unknown,
1390     CM_Widen,         // For consecutive accesses with stride +1.
1391     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1392     CM_Interleave,
1393     CM_GatherScatter,
1394     CM_Scalarize
1395   };
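
  // Illustrative mapping (a sketch, not an exhaustive rule); the actual choice
  // is cost-based and made in setCostBasedWideningDecision:
  //
  //   for (int i = 0; i < N; ++i) {
  //     x += A[i];         // consecutive, stride +1 -> CM_Widen candidate
  //     y += B[N - 1 - i]; // consecutive, stride -1 -> CM_Widen_Reverse
  //     z += C[Idx[i]];    // arbitrary index        -> CM_GatherScatter
  //   }
  //
  // Members of an interleave group (e.g. A[2*i] and A[2*i+1]) map to
  // CM_Interleave, and accesses the target cannot widen fall back to
  // CM_Scalarize.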
1396 
1397   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1398   /// instruction \p I and vector width \p VF.
1399   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1400                            InstructionCost Cost) {
1401     assert(VF.isVector() && "Expected VF >=2");
1402     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1403   }
1404 
1405   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1406   /// interleaving group \p Grp and vector width \p VF.
1407   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1408                            ElementCount VF, InstWidening W,
1409                            InstructionCost Cost) {
1410     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group. But the
    // cost will be assigned to one instruction only.
1413     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1414       if (auto *I = Grp->getMember(i)) {
1415         if (Grp->getInsertPos() == I)
1416           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1417         else
1418           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1419       }
1420     }
1421   }
1422 
1423   /// Return the cost model decision for the given instruction \p I and vector
1424   /// width \p VF. Return CM_Unknown if this instruction did not pass
1425   /// through the cost modeling.
1426   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1427     assert(VF.isVector() && "Expected VF to be a vector VF");
1428     // Cost model is not run in the VPlan-native path - return conservative
1429     // result until this changes.
1430     if (EnableVPlanNativePath)
1431       return CM_GatherScatter;
1432 
1433     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1434     auto Itr = WideningDecisions.find(InstOnVF);
1435     if (Itr == WideningDecisions.end())
1436       return CM_Unknown;
1437     return Itr->second.first;
1438   }
1439 
1440   /// Return the vectorization cost for the given instruction \p I and vector
1441   /// width \p VF.
1442   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1443     assert(VF.isVector() && "Expected VF >=2");
1444     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1445     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1446            "The cost is not calculated");
1447     return WideningDecisions[InstOnVF].second;
1448   }
1449 
1450   /// Return True if instruction \p I is an optimizable truncate whose operand
1451   /// is an induction variable. Such a truncate will be removed by adding a new
1452   /// induction variable with the destination type.
1453   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1454     // If the instruction is not a truncate, return false.
1455     auto *Trunc = dyn_cast<TruncInst>(I);
1456     if (!Trunc)
1457       return false;
1458 
1459     // Get the source and destination types of the truncate.
1460     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1461     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1462 
1463     // If the truncate is free for the given types, return false. Replacing a
1464     // free truncate with an induction variable would add an induction variable
1465     // update instruction to each iteration of the loop. We exclude from this
1466     // check the primary induction variable since it will need an update
1467     // instruction regardless.
1468     Value *Op = Trunc->getOperand(0);
1469     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1470       return false;
1471 
1472     // If the truncated value is not an induction variable, return false.
1473     return Legal->isInductionPhi(Op);
1474   }
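
  // Illustrative example (a sketch): in
  //
  //   for (long i = 0; i < N; ++i)
  //     Out[i] = (int)i;   // trunc i64 %iv to i32
  //
  // the truncate of the induction variable can be removed by introducing a
  // second, i32-typed induction variable that is incremented in lock-step,
  // provided the truncate is not already free for the target.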
1475 
1476   /// Collects the instructions to scalarize for each predicated instruction in
1477   /// the loop.
1478   void collectInstsToScalarize(ElementCount VF);
1479 
1480   /// Collect Uniform and Scalar values for the given \p VF.
1481   /// The sets depend on CM decision for Load/Store instructions
1482   /// that may be vectorized as interleave, gather-scatter or scalarized.
1483   void collectUniformsAndScalars(ElementCount VF) {
1484     // Do the analysis once.
1485     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1486       return;
1487     setCostBasedWideningDecision(VF);
1488     collectLoopUniforms(VF);
1489     collectLoopScalars(VF);
1490   }
1491 
1492   /// Returns true if the target machine supports masked store operation
1493   /// for the given \p DataType and kind of access to \p Ptr.
1494   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1495     return Legal->isConsecutivePtr(DataType, Ptr) &&
1496            TTI.isLegalMaskedStore(DataType, Alignment);
1497   }
1498 
1499   /// Returns true if the target machine supports masked load operation
1500   /// for the given \p DataType and kind of access to \p Ptr.
1501   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1502     return Legal->isConsecutivePtr(DataType, Ptr) &&
1503            TTI.isLegalMaskedLoad(DataType, Alignment);
1504   }
1505 
1506   /// Returns true if the target machine can represent \p V as a masked gather
1507   /// or scatter operation.
1508   bool isLegalGatherOrScatter(Value *V,
1509                               ElementCount VF = ElementCount::getFixed(1)) {
1510     bool LI = isa<LoadInst>(V);
1511     bool SI = isa<StoreInst>(V);
1512     if (!LI && !SI)
1513       return false;
1514     auto *Ty = getLoadStoreType(V);
1515     Align Align = getLoadStoreAlignment(V);
1516     if (VF.isVector())
1517       Ty = VectorType::get(Ty, VF);
1518     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1519            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1520   }
1521 
1522   /// Returns true if the target machine supports all of the reduction
1523   /// variables found for the given VF.
1524   bool canVectorizeReductions(ElementCount VF) const {
1525     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1526       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1527       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1528     }));
1529   }
1530 
1531   /// Returns true if \p I is an instruction that will be scalarized with
1532   /// predication when vectorizing \p I with vectorization factor \p VF. Such
1533   /// instructions include conditional stores and instructions that may divide
1534   /// by zero.
1535   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1536 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// \p VF is the vectorization factor that will be used to vectorize \p I.
  /// Superset of instructions that return true for isScalarWithPredication.
1541   bool isPredicatedInst(Instruction *I, ElementCount VF,
1542                         bool IsKnownUniform = false) {
    // When we know the load is uniform and the original scalar loop was not
    // predicated, we don't need to mark it as a predicated instruction. Any
    // vectorized blocks created when tail-folding are artificial blocks that
    // we have introduced, and we know there is always at least one active
    // lane. That's why we call Legal->blockNeedsPredication here: it does not
    // query tail-folding.
1549     if (IsKnownUniform && isa<LoadInst>(I) &&
1550         !Legal->blockNeedsPredication(I->getParent()))
1551       return false;
1552     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1553       return false;
1554     // Loads and stores that need some form of masked operation are predicated
1555     // instructions.
1556     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1557       return Legal->isMaskRequired(I);
1558     return isScalarWithPredication(I, VF);
1559   }
1560 
1561   /// Returns true if \p I is a memory instruction with consecutive memory
1562   /// access that can be widened.
1563   bool
1564   memoryInstructionCanBeWidened(Instruction *I,
1565                                 ElementCount VF = ElementCount::getFixed(1));
1566 
1567   /// Returns true if \p I is a memory instruction in an interleaved-group
1568   /// of memory accesses that can be vectorized with wide vector loads/stores
1569   /// and shuffles.
1570   bool
1571   interleavedAccessCanBeWidened(Instruction *I,
1572                                 ElementCount VF = ElementCount::getFixed(1));
1573 
1574   /// Check if \p Instr belongs to any interleaved access group.
1575   bool isAccessInterleaved(Instruction *Instr) {
1576     return InterleaveInfo.isInterleaved(Instr);
1577   }
1578 
1579   /// Get the interleaved access group that \p Instr belongs to.
1580   const InterleaveGroup<Instruction> *
1581   getInterleavedAccessGroup(Instruction *Instr) {
1582     return InterleaveInfo.getInterleaveGroup(Instr);
1583   }
1584 
1585   /// Returns true if we're required to use a scalar epilogue for at least
1586   /// the final iteration of the original loop.
1587   bool requiresScalarEpilogue(ElementCount VF) const {
1588     if (!isScalarEpilogueAllowed())
1589       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1592     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1593       return true;
1594     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1595   }
1596 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1599   bool isScalarEpilogueAllowed() const {
1600     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1601   }
1602 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1604   bool foldTailByMasking() const { return FoldTailByMasking; }
1605 
  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
1609   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1610     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1611   }
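
  // Illustrative example (not part of the pass): both sources of predication
  // show up in
  //
  //   for (int i = 0; i < N; ++i)
  //     if (Cond[i])
  //       A[i] /= B[i];   // predicated in the original loop
  //
  // where the body needs a mask because of the original control flow, and,
  // once the tail is folded, even unconditional blocks need a mask for the
  // final, partially-full vector iteration.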
1612 
1613   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1614   /// nodes to the chain of instructions representing the reductions. Uses a
1615   /// MapVector to ensure deterministic iteration order.
1616   using ReductionChainMap =
1617       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1618 
1619   /// Return the chain of instructions representing an inloop reduction.
1620   const ReductionChainMap &getInLoopReductionChains() const {
1621     return InLoopReductionChains;
1622   }
1623 
1624   /// Returns true if the Phi is part of an inloop reduction.
1625   bool isInLoopReduction(PHINode *Phi) const {
1626     return InLoopReductionChains.count(Phi);
1627   }
1628 
1629   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1630   /// with factor VF.  Return the cost of the instruction, including
1631   /// scalarization overhead if it's needed.
1632   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1633 
1634   /// Estimate cost of a call instruction CI if it were vectorized with factor
1635   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows whether the call needs to
  /// be scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1639   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1640                                     bool &NeedToScalarize) const;
1641 
1642   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1643   /// that of B.
1644   bool isMoreProfitable(const VectorizationFactor &A,
1645                         const VectorizationFactor &B) const;
1646 
1647   /// Invalidates decisions already taken by the cost model.
1648   void invalidateCostModelingDecisions() {
1649     WideningDecisions.clear();
1650     Uniforms.clear();
1651     Scalars.clear();
1652   }
1653 
1654 private:
1655   unsigned NumPredStores = 0;
1656 
  /// Convenience function that returns the value of vscale_range if
  /// vscale_range.min == vscale_range.max, and otherwise returns the value
  /// returned by the corresponding TTI method.
1660   Optional<unsigned> getVScaleForTuning() const;
1661 
1662   /// \return An upper bound for the vectorization factors for both
1663   /// fixed and scalable vectorization, where the minimum-known number of
1664   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1665   /// disabled or unsupported, then the scalable part will be equal to
1666   /// ElementCount::getScalable(0).
1667   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1668                                            ElementCount UserVF,
1669                                            bool FoldTailByMasking);
1670 
  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
1673   /// This is a helper function of computeFeasibleMaxVF.
1674   /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1675   /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
1677   /// D98509). The issue is currently under investigation and this workaround
1678   /// will be removed as soon as possible.
1679   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1680                                        unsigned SmallestType,
1681                                        unsigned WidestType,
1682                                        const ElementCount &MaxSafeVF,
1683                                        bool FoldTailByMasking);
1684 
1685   /// \return the maximum legal scalable VF, based on the safe max number
1686   /// of elements.
1687   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1688 
1689   /// The vectorization cost is a combination of the cost itself and a boolean
1690   /// indicating whether any of the contributing operations will actually
1691   /// operate on vector values after type legalization in the backend. If this
1692   /// latter value is false, then all operations will be scalarized (i.e. no
1693   /// vectorization has actually taken place).
1694   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1695 
1696   /// Returns the expected execution cost. The unit of the cost does
1697   /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the vectorization factor. If \p Invalid is not nullptr, this function
1700   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1701   /// each instruction that has an Invalid cost for the given VF.
1702   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1703   VectorizationCostTy
1704   expectedCost(ElementCount VF,
1705                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1706 
1707   /// Returns the execution time cost of an instruction for a given vector
1708   /// width. Vector width of one means scalar.
1709   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1710 
1711   /// The cost-computation logic from getInstructionCost which provides
1712   /// the vector type as an output parameter.
1713   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1714                                      Type *&VectorTy);
1715 
1716   /// Return the cost of instructions in an inloop reduction pattern, if I is
1717   /// part of that pattern.
1718   Optional<InstructionCost>
1719   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1720                           TTI::TargetCostKind CostKind);
1721 
1722   /// Calculate vectorization cost of memory instruction \p I.
1723   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1724 
1725   /// The cost computation for scalarized memory instruction.
1726   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1727 
1728   /// The cost computation for interleaving group of memory instructions.
1729   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1730 
1731   /// The cost computation for Gather/Scatter instruction.
1732   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1733 
1734   /// The cost computation for widening instruction \p I with consecutive
1735   /// memory access.
1736   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1737 
1738   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1739   /// Load: scalar load + broadcast.
1740   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1741   /// element)
1742   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1743 
1744   /// Estimate the overhead of scalarizing an instruction. This is a
1745   /// convenience wrapper for the type-based getScalarizationOverhead API.
1746   InstructionCost getScalarizationOverhead(Instruction *I,
1747                                            ElementCount VF) const;
1748 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1751   bool isConsecutiveLoadOrStore(Instruction *I);
1752 
1753   /// Returns true if an artificially high cost for emulated masked memrefs
1754   /// should be used.
1755   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1756 
1757   /// Map of scalar integer values to the smallest bitwidth they can be legally
1758   /// represented as. The vector equivalents of these values should be truncated
1759   /// to this type.
1760   MapVector<Instruction *, uint64_t> MinBWs;
1761 
1762   /// A type representing the costs for instructions if they were to be
1763   /// scalarized rather than vectorized. The entries are Instruction-Cost
1764   /// pairs.
1765   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1766 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
1769   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1770 
1771   /// Records whether it is allowed to have the original scalar loop execute at
1772   /// least once. This may be needed as a fallback loop in case runtime
1773   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not divisible by the VF,
1775   /// or as a peel-loop to handle gaps in interleave-groups.
1776   /// Under optsize and when the trip count is very small we don't allow any
1777   /// iterations to execute in the scalar loop.
1778   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1779 
1780   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1781   bool FoldTailByMasking = false;
1782 
1783   /// A map holding scalar costs for different vectorization factors. The
1784   /// presence of a cost for an instruction in the mapping indicates that the
1785   /// instruction will be scalarized when vectorizing with the associated
1786   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1787   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1788 
1789   /// Holds the instructions known to be uniform after vectorization.
1790   /// The data is collected per VF.
1791   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1792 
1793   /// Holds the instructions known to be scalar after vectorization.
1794   /// The data is collected per VF.
1795   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1796 
1797   /// Holds the instructions (address computations) that are forced to be
1798   /// scalarized.
1799   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1800 
1801   /// PHINodes of the reductions that should be expanded in-loop along with
1802   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1804   ReductionChainMap InLoopReductionChains;
1805 
  /// A map of inloop reduction operations and their immediate chain operand.
1807   /// FIXME: This can be removed once reductions can be costed correctly in
1808   /// vplan. This was added to allow quick lookup to the inloop operations,
1809   /// without having to loop through InLoopReductionChains.
1810   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1811 
1812   /// Returns the expected difference in cost from scalarizing the expression
1813   /// feeding a predicated instruction \p PredInst. The instructions to
1814   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1815   /// non-negative return value implies the expression will be scalarized.
1816   /// Currently, only single-use chains are considered for scalarization.
1817   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1818                               ElementCount VF);
1819 
1820   /// Collect the instructions that are uniform after vectorization. An
1821   /// instruction is uniform if we represent it with a single scalar value in
1822   /// the vectorized loop corresponding to each vector iteration. Examples of
1823   /// uniform instructions include pointer operands of consecutive or
1824   /// interleaved memory accesses. Note that although uniformity implies an
1825   /// instruction will be scalar, the reverse is not true. In general, a
1826   /// scalarized instruction will be represented by VF scalar values in the
1827   /// vectorized loop, each corresponding to an iteration of the original
1828   /// scalar loop.
1829   void collectLoopUniforms(ElementCount VF);
1830 
1831   /// Collect the instructions that are scalar after vectorization. An
1832   /// instruction is scalar if it is known to be uniform or will be scalarized
1833   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1834   /// to the list if they are used by a load/store instruction that is marked as
1835   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1836   /// VF values in the vectorized loop, each corresponding to an iteration of
1837   /// the original scalar loop.
1838   void collectLoopScalars(ElementCount VF);
1839 
1840   /// Keeps cost model vectorization decision and cost for instructions.
1841   /// Right now it is used for memory instructions only.
1842   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1843                                 std::pair<InstWidening, InstructionCost>>;
1844 
1845   DecisionList WideningDecisions;
1846 
1847   /// Returns true if \p V is expected to be vectorized and it needs to be
1848   /// extracted.
1849   bool needsExtract(Value *V, ElementCount VF) const {
1850     Instruction *I = dyn_cast<Instruction>(V);
1851     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1852         TheLoop->isLoopInvariant(I))
1853       return false;
1854 
1855     // Assume we can vectorize V (and hence we need extraction) if the
    // scalars are not computed yet. This can happen because it is called
1857     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1858     // the scalars are collected. That should be a safe assumption in most
1859     // cases, because we check if the operands have vectorizable types
1860     // beforehand in LoopVectorizationLegality.
1861     return Scalars.find(VF) == Scalars.end() ||
1862            !isScalarAfterVectorization(I, VF);
1863   };
1864 
1865   /// Returns a range containing only operands needing to be extracted.
1866   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1867                                                    ElementCount VF) const {
1868     return SmallVector<Value *, 4>(make_filter_range(
1869         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1870   }
1871 
1872   /// Determines if we have the infrastructure to vectorize loop \p L and its
1873   /// epilogue, assuming the main loop is vectorized by \p VF.
1874   bool isCandidateForEpilogueVectorization(const Loop &L,
1875                                            const ElementCount VF) const;
1876 
1877   /// Returns true if epilogue vectorization is considered profitable, and
1878   /// false otherwise.
1879   /// \p VF is the vectorization factor chosen for the original loop.
1880   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1881 
1882 public:
1883   /// The loop that we evaluate.
1884   Loop *TheLoop;
1885 
1886   /// Predicated scalar evolution analysis.
1887   PredicatedScalarEvolution &PSE;
1888 
1889   /// Loop Info analysis.
1890   LoopInfo *LI;
1891 
1892   /// Vectorization legality.
1893   LoopVectorizationLegality *Legal;
1894 
1895   /// Vector target information.
1896   const TargetTransformInfo &TTI;
1897 
1898   /// Target Library Info.
1899   const TargetLibraryInfo *TLI;
1900 
1901   /// Demanded bits analysis.
1902   DemandedBits *DB;
1903 
1904   /// Assumption cache.
1905   AssumptionCache *AC;
1906 
1907   /// Interface to emit optimization remarks.
1908   OptimizationRemarkEmitter *ORE;
1909 
1910   const Function *TheFunction;
1911 
1912   /// Loop Vectorize Hint.
1913   const LoopVectorizeHints *Hints;
1914 
1915   /// The interleave access information contains groups of interleaved accesses
1916   /// with the same stride and close to each other.
1917   InterleavedAccessInfo &InterleaveInfo;
1918 
1919   /// Values to ignore in the cost model.
1920   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1921 
1922   /// Values to ignore in the cost model when VF > 1.
1923   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1924 
1925   /// All element types found in the loop.
1926   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1927 
1928   /// Profitable vector factors.
1929   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1930 };
1931 } // end namespace llvm
1932 
1933 /// Helper struct to manage generating runtime checks for vectorization.
1934 ///
/// The runtime checks are created up-front in temporary blocks, un-linked from
/// the existing IR, to allow better estimation of their cost. After deciding
/// to vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
1939 class GeneratedRTChecks {
1940   /// Basic block which contains the generated SCEV checks, if any.
1941   BasicBlock *SCEVCheckBlock = nullptr;
1942 
1943   /// The value representing the result of the generated SCEV checks. If it is
1944   /// nullptr, either no SCEV checks have been generated or they have been used.
1945   Value *SCEVCheckCond = nullptr;
1946 
1947   /// Basic block which contains the generated memory runtime checks, if any.
1948   BasicBlock *MemCheckBlock = nullptr;
1949 
1950   /// The value representing the result of the generated memory runtime checks.
1951   /// If it is nullptr, either no memory runtime checks have been generated or
1952   /// they have been used.
1953   Value *MemRuntimeCheckCond = nullptr;
1954 
1955   DominatorTree *DT;
1956   LoopInfo *LI;
1957 
1958   SCEVExpander SCEVExp;
1959   SCEVExpander MemCheckExp;
1960 
1961 public:
1962   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1963                     const DataLayout &DL)
1964       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1965         MemCheckExp(SE, DL, "scev.check") {}
1966 
1967   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1968   /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
1972   void Create(Loop *L, const LoopAccessInfo &LAI,
1973               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1974 
1975     BasicBlock *LoopHeader = L->getHeader();
1976     BasicBlock *Preheader = L->getLoopPreheader();
1977 
1978     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1979     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1980     // may be used by SCEVExpander. The blocks will be un-linked from their
1981     // predecessors and removed from LI & DT at the end of the function.
1982     if (!UnionPred.isAlwaysTrue()) {
1983       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1984                                   nullptr, "vector.scevcheck");
1985 
1986       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1987           &UnionPred, SCEVCheckBlock->getTerminator());
1988     }
1989 
1990     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1991     if (RtPtrChecking.Need) {
1992       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1993       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1994                                  "vector.memcheck");
1995 
1996       auto DiffChecks = RtPtrChecking.getDiffChecks();
1997       if (DiffChecks) {
1998         MemRuntimeCheckCond = addDiffRuntimeChecks(
1999             MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp,
2000             [VF](IRBuilderBase &B, unsigned Bits) {
2001               return getRuntimeVF(B, B.getIntNTy(Bits), VF);
2002             },
2003             IC);
2004       } else {
2005         MemRuntimeCheckCond =
2006             addRuntimeChecks(MemCheckBlock->getTerminator(), L,
2007                              RtPtrChecking.getChecks(), MemCheckExp);
2008       }
2009       assert(MemRuntimeCheckCond &&
2010              "no RT checks generated although RtPtrChecking "
2011              "claimed checks are required");
2012     }
2013 
2014     if (!MemCheckBlock && !SCEVCheckBlock)
2015       return;
2016 
    // Unhook the temporary blocks with the checks and update various places
    // accordingly.
2019     if (SCEVCheckBlock)
2020       SCEVCheckBlock->replaceAllUsesWith(Preheader);
2021     if (MemCheckBlock)
2022       MemCheckBlock->replaceAllUsesWith(Preheader);
2023 
2024     if (SCEVCheckBlock) {
2025       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2026       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2027       Preheader->getTerminator()->eraseFromParent();
2028     }
2029     if (MemCheckBlock) {
2030       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2031       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2032       Preheader->getTerminator()->eraseFromParent();
2033     }
2034 
2035     DT->changeImmediateDominator(LoopHeader, Preheader);
2036     if (MemCheckBlock) {
2037       DT->eraseNode(MemCheckBlock);
2038       LI->removeBlock(MemCheckBlock);
2039     }
2040     if (SCEVCheckBlock) {
2041       DT->eraseNode(SCEVCheckBlock);
2042       LI->removeBlock(SCEVCheckBlock);
2043     }
2044   }
2045 
2046   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2047   /// unused.
2048   ~GeneratedRTChecks() {
2049     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2050     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2051     if (!SCEVCheckCond)
2052       SCEVCleaner.markResultUsed();
2053 
2054     if (!MemRuntimeCheckCond)
2055       MemCheckCleaner.markResultUsed();
2056 
2057     if (MemRuntimeCheckCond) {
2058       auto &SE = *MemCheckExp.getSE();
2059       // Memory runtime check generation creates compares that use expanded
2060       // values. Remove them before running the SCEVExpanderCleaners.
2061       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2062         if (MemCheckExp.isInsertedInstruction(&I))
2063           continue;
2064         SE.forgetValue(&I);
2065         I.eraseFromParent();
2066       }
2067     }
2068     MemCheckCleaner.cleanup();
2069     SCEVCleaner.cleanup();
2070 
2071     if (SCEVCheckCond)
2072       SCEVCheckBlock->eraseFromParent();
2073     if (MemRuntimeCheckCond)
2074       MemCheckBlock->eraseFromParent();
2075   }
2076 
2077   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2078   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2079   /// depending on the generated condition.
2080   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2081                              BasicBlock *LoopVectorPreHeader,
2082                              BasicBlock *LoopExitBlock) {
2083     if (!SCEVCheckCond)
2084       return nullptr;
2085 
2086     Value *Cond = SCEVCheckCond;
2087     // Mark the check as used, to prevent it from being removed during cleanup.
2088     SCEVCheckCond = nullptr;
2089     if (auto *C = dyn_cast<ConstantInt>(Cond))
2090       if (C->isZero())
2091         return nullptr;
2092 
2093     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2094 
2095     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2096     // Create new preheader for vector loop.
2097     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2098       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2099 
2100     SCEVCheckBlock->getTerminator()->eraseFromParent();
2101     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2102     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2103                                                 SCEVCheckBlock);
2104 
2105     DT->addNewBlock(SCEVCheckBlock, Pred);
2106     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2107 
2108     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2109                         BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2110     return SCEVCheckBlock;
2111   }
2112 
2113   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2114   /// the branches to branch to the vector preheader or \p Bypass, depending on
2115   /// the generated condition.
2116   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2117                                    BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks at runtime whether arrays
    // overlap.
2119     if (!MemRuntimeCheckCond)
2120       return nullptr;
2121 
2122     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2123     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2124                                                 MemCheckBlock);
2125 
2126     DT->addNewBlock(MemCheckBlock, Pred);
2127     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2128     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2129 
2130     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2131       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2132 
2133     ReplaceInstWithInst(
2134         MemCheckBlock->getTerminator(),
2135         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2136     MemCheckBlock->getTerminator()->setDebugLoc(
2137         Pred->getTerminator()->getDebugLoc());
2138 
2139     // Mark the check as used, to prevent it from being removed during cleanup.
2140     MemRuntimeCheckCond = nullptr;
2141     return MemCheckBlock;
2142   }
2143 };
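
// A minimal usage sketch for GeneratedRTChecks (illustrative only; the real
// call sites thread in additional state). It assumes SE/DT/LI, the union SCEV
// predicate Pred, the LoopAccessInfo LAI, the chosen VF and interleave count
// IC, and the bypass/preheader/exit blocks are already available:
//
//   GeneratedRTChecks Checks(*SE, DT, LI, DL);
//   Checks.Create(L, *LAI, Pred, VF, IC);
//   // ... decide whether vectorization is still worthwhile given the cost of
//   // the generated checks ...
//   if (BasicBlock *SCEVCheckBB =
//           Checks.emitSCEVChecks(BypassBB, VecPreheader, ExitBB)) {
//     // SCEV checks now branch to the vector preheader or to BypassBB.
//   }
//   if (BasicBlock *MemCheckBB =
//           Checks.emitMemRuntimeChecks(BypassBB, VecPreheader)) {
//     // Memory overlap checks now guard the vector loop.
//   }
//
// If neither emit function is called, the destructor removes the temporary
// check blocks again.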
2144 
2145 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2146 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Note that we are currently forced to abuse the pragma 'clang
2152 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2153 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2154 // provides *explicit vectorization hints* (LV can bypass legal checks and
2155 // assume that vectorization is legal). However, both hints are implemented
2156 // using the same metadata (llvm.loop.vectorize, processed by
2157 // LoopVectorizeHints). This will be fixed in the future when the native IR
2158 // representation for pragma 'omp simd' is introduced.
2159 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2160                                    OptimizationRemarkEmitter *ORE) {
2161   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2162   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2163 
2164   // Only outer loops with an explicit vectorization hint are supported.
2165   // Unannotated outer loops are ignored.
2166   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2167     return false;
2168 
2169   Function *Fn = OuterLp->getHeader()->getParent();
2170   if (!Hints.allowVectorization(Fn, OuterLp,
2171                                 true /*VectorizeOnlyWhenForced*/)) {
2172     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2173     return false;
2174   }
2175 
2176   if (Hints.getInterleave() > 1) {
2177     // TODO: Interleave support is future work.
2178     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2179                          "outer loops.\n");
2180     Hints.emitRemarkWithHints();
2181     return false;
2182   }
2183 
2184   return true;
2185 }
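
// Illustrative example of an explicitly annotated outer loop that
// isExplicitVecOuterLoop accepts (a sketch; the mandatory vector length can
// equivalently be given via
// '#pragma clang loop vectorize(enable) vectorize_width(4)'):
//
//   #pragma omp simd simdlen(4)
//   for (int i = 0; i < N; ++i)     // outer loop, explicitly annotated
//     for (int j = 0; j < M; ++j)   // inner loop
//       A[i][j] = B[i][j] + C[i];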
2186 
2187 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2188                                   OptimizationRemarkEmitter *ORE,
2189                                   SmallVectorImpl<Loop *> &V) {
2190   // Collect inner loops and outer loops without irreducible control flow. For
2191   // now, only collect outer loops that have explicit vectorization hints. If we
2192   // are stress testing the VPlan H-CFG construction, we collect the outermost
2193   // loop of every loop nest.
2194   if (L.isInnermost() || VPlanBuildStressTest ||
2195       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2196     LoopBlocksRPO RPOT(&L);
2197     RPOT.perform(LI);
2198     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2199       V.push_back(&L);
2200       // TODO: Collect inner loops inside marked outer loops in case
2201       // vectorization fails for the outer loop. Do not invoke
2202       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2203       // already known to be reducible. We can use an inherited attribute for
2204       // that.
2205       return;
2206     }
2207   }
2208   for (Loop *InnerL : L)
2209     collectSupportedLoops(*InnerL, LI, ORE, V);
2210 }
2211 
2212 namespace {
2213 
2214 /// The LoopVectorize Pass.
2215 struct LoopVectorize : public FunctionPass {
2216   /// Pass identification, replacement for typeid
2217   static char ID;
2218 
2219   LoopVectorizePass Impl;
2220 
2221   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2222                          bool VectorizeOnlyWhenForced = false)
2223       : FunctionPass(ID),
2224         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2225     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2226   }
2227 
2228   bool runOnFunction(Function &F) override {
2229     if (skipFunction(F))
2230       return false;
2231 
2232     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2233     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2234     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2235     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2236     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2237     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2238     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2239     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2240     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2241     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2242     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2243     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2244     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2245 
2246     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2247         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2248 
2249     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2250                         GetLAA, *ORE, PSI).MadeAnyChange;
2251   }
2252 
2253   void getAnalysisUsage(AnalysisUsage &AU) const override {
2254     AU.addRequired<AssumptionCacheTracker>();
2255     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2256     AU.addRequired<DominatorTreeWrapperPass>();
2257     AU.addRequired<LoopInfoWrapperPass>();
2258     AU.addRequired<ScalarEvolutionWrapperPass>();
2259     AU.addRequired<TargetTransformInfoWrapperPass>();
2260     AU.addRequired<AAResultsWrapperPass>();
2261     AU.addRequired<LoopAccessLegacyAnalysis>();
2262     AU.addRequired<DemandedBitsWrapperPass>();
2263     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2264     AU.addRequired<InjectTLIMappingsLegacy>();
2265 
2266     // We currently do not preserve loopinfo/dominator analyses with outer loop
2267     // vectorization. Until this is addressed, mark these analyses as preserved
2268     // only for non-VPlan-native path.
2269     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2270     if (!EnableVPlanNativePath) {
2271       AU.addPreserved<LoopInfoWrapperPass>();
2272       AU.addPreserved<DominatorTreeWrapperPass>();
2273     }
2274 
2275     AU.addPreserved<BasicAAWrapperPass>();
2276     AU.addPreserved<GlobalsAAWrapperPass>();
2277     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2278   }
2279 };
2280 
2281 } // end anonymous namespace
2282 
2283 //===----------------------------------------------------------------------===//
2284 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2285 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2286 //===----------------------------------------------------------------------===//
2287 
2288 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
2292   Instruction *Instr = dyn_cast<Instruction>(V);
2293   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2294                      (!Instr ||
2295                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2296   // Place the code for broadcasting invariant variables in the new preheader.
2297   IRBuilder<>::InsertPointGuard Guard(Builder);
2298   if (SafeToHoist)
2299     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2300 
2301   // Broadcast the scalar into all locations in the vector.
2302   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2303 
2304   return Shuf;
2305 }
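
// Illustrative result (a sketch): broadcasting a loop-invariant scalar %x at
// VF = 4 produces the usual insertelement/shufflevector splat pattern, hoisted
// to the vector preheader when that is safe, along the lines of
//
//   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i64 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> poison, <4 x i32> zeroinitializer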
2306 
2307 /// This function adds
2308 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is only relevant for FP induction variables.
2311 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2312                             Instruction::BinaryOps BinOp, ElementCount VF,
2313                             IRBuilderBase &Builder) {
2314   assert(VF.isVector() && "only vector VFs are supported");
2315 
2316   // Create and check the types.
2317   auto *ValVTy = cast<VectorType>(Val->getType());
2318   ElementCount VLen = ValVTy->getElementCount();
2319 
2320   Type *STy = Val->getType()->getScalarType();
2321   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2322          "Induction Step must be an integer or FP");
2323   assert(Step->getType() == STy && "Step has wrong type");
2324 
2325   SmallVector<Constant *, 8> Indices;
2326 
  // Create a vector of consecutive numbers from zero to VF - 1.
2328   VectorType *InitVecValVTy = ValVTy;
2329   if (STy->isFloatingPointTy()) {
2330     Type *InitVecValSTy =
2331         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2332     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2333   }
2334   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2335 
2336   // Splat the StartIdx
2337   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2338 
2339   if (STy->isIntegerTy()) {
2340     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2341     Step = Builder.CreateVectorSplat(VLen, Step);
2342     assert(Step->getType() == Val->getType() && "Invalid step vec");
2343     // FIXME: The newly created binary instructions should contain nsw/nuw
2344     // flags, which can be found from the original scalar operations.
2345     Step = Builder.CreateMul(InitVec, Step);
2346     return Builder.CreateAdd(Val, Step, "induction");
2347   }
2348 
2349   // Floating point induction.
2350   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2351          "Binary Opcode should be specified for FP induction");
2352   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2353   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2354 
2355   Step = Builder.CreateVectorSplat(VLen, Step);
2356   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2357   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2358 }
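
// Illustrative example (not part of the pass): for an integer induction with
// VF = 4, Val = <%iv, %iv, %iv, %iv>, StartIdx = 0 and Step = %stride, the
// returned "induction" vector is
//
//   <%iv + 0 * %stride, %iv + 1 * %stride, %iv + 2 * %stride, %iv + 3 * %stride>
//
// i.e. the step vector <0, 1, 2, 3> is offset by StartIdx, multiplied by the
// splatted step and added to the splatted value.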
2359 
2360 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, and \p Step is the size of the step.
2362 static void buildScalarSteps(Value *ScalarIV, Value *Step,
2363                              const InductionDescriptor &ID, VPValue *Def,
2364                              VPTransformState &State) {
2365   IRBuilderBase &Builder = State.Builder;
2366   // We shouldn't have to build scalar steps if we aren't vectorizing.
2367   assert(State.VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2369   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2370   assert(ScalarIVTy == Step->getType() &&
2371          "Val and Step should have the same type");
2372 
2373   // We build scalar steps for both integer and floating-point induction
2374   // variables. Here, we determine the kind of arithmetic we will perform.
2375   Instruction::BinaryOps AddOp;
2376   Instruction::BinaryOps MulOp;
2377   if (ScalarIVTy->isIntegerTy()) {
2378     AddOp = Instruction::Add;
2379     MulOp = Instruction::Mul;
2380   } else {
2381     AddOp = ID.getInductionOpcode();
2382     MulOp = Instruction::FMul;
2383   }
2384 
2385   // Determine the number of scalars we need to generate for each unroll
2386   // iteration.
2387   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2388   unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2389   // Compute the scalar steps and save the results in State.
2390   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2391                                      ScalarIVTy->getScalarSizeInBits());
2392   Type *VecIVTy = nullptr;
2393   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2394   if (!FirstLaneOnly && State.VF.isScalable()) {
2395     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2396     UnitStepVec =
2397         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2398     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2399     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2400   }
2401 
2402   for (unsigned Part = 0; Part < State.UF; ++Part) {
2403     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2404 
2405     if (!FirstLaneOnly && State.VF.isScalable()) {
2406       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2407       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2408       if (ScalarIVTy->isFloatingPointTy())
2409         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2410       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2411       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2412       State.set(Def, Add, Part);
2413       // It's useful to record the lane values too for the known minimum number
2414       // of elements so we do those below. This improves the code quality when
2415       // trying to extract the first element, for example.
2416     }
2417 
2418     if (ScalarIVTy->isFloatingPointTy())
2419       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2420 
2421     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2422       Value *StartIdx = Builder.CreateBinOp(
2423           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2424       // The step returned by `createStepForVF` is a runtime-evaluated value
2425       // when VF is scalable. Otherwise, it should be folded into a Constant.
2426       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2427              "Expected StartIdx to be folded to a constant when VF is not "
2428              "scalable");
2429       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2430       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2431       State.set(Def, Add, VPIteration(Part, Lane));
2432     }
2433   }
2434 }
2435 
// Generate code for the induction step. Note that induction steps are
// required to be loop-invariant.
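// For example (illustrative): a step that is a plain SCEVUnknown (e.g. %n) is
// returned directly, while a composite step such as (4 * %n) is expanded to
// instructions at \p InsertBefore.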
2438 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2439                               Instruction *InsertBefore,
2440                               Loop *OrigLoop = nullptr) {
2441   const DataLayout &DL = SE.getDataLayout();
2442   assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2443          "Induction step should be loop invariant");
2444   if (auto *E = dyn_cast<SCEVUnknown>(Step))
2445     return E->getValue();
2446 
2447   SCEVExpander Exp(SE, DL, "induction");
2448   return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2449 }
2450 
2451 /// Compute the transformed value of Index at offset StartValue using step
2452 /// StepValue.
2453 /// For integer induction, returns StartValue + Index * StepValue.
2454 /// For pointer induction, returns StartValue[Index * StepValue].
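/// For example (illustrative): for an integer induction with StartValue 7 and
/// StepValue 3, Index 5 yields 7 + 5 * 3 = 22; for a pointer induction the
/// result is instead a GEP of the induction's element type.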
2455 /// FIXME: The newly created binary instructions should contain nsw/nuw
2456 /// flags, which can be found from the original scalar operations.
2457 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2458                                    Value *StartValue, Value *Step,
2459                                    const InductionDescriptor &ID) {
2460   assert(Index->getType()->getScalarType() == Step->getType() &&
2461          "Index scalar type does not match StepValue type");
2462 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
2469   auto CreateAdd = [&B](Value *X, Value *Y) {
2470     assert(X->getType() == Y->getType() && "Types don't match!");
2471     if (auto *CX = dyn_cast<ConstantInt>(X))
2472       if (CX->isZero())
2473         return Y;
2474     if (auto *CY = dyn_cast<ConstantInt>(Y))
2475       if (CY->isZero())
2476         return X;
2477     return B.CreateAdd(X, Y);
2478   };
2479 
2480   // We allow X to be a vector type, in which case Y will potentially be
2481   // splatted into a vector with the same element count.
2482   auto CreateMul = [&B](Value *X, Value *Y) {
2483     assert(X->getType()->getScalarType() == Y->getType() &&
2484            "Types don't match!");
2485     if (auto *CX = dyn_cast<ConstantInt>(X))
2486       if (CX->isOne())
2487         return Y;
2488     if (auto *CY = dyn_cast<ConstantInt>(Y))
2489       if (CY->isOne())
2490         return X;
2491     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2492     if (XVTy && !isa<VectorType>(Y->getType()))
2493       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2494     return B.CreateMul(X, Y);
2495   };
2496 
2497   switch (ID.getKind()) {
2498   case InductionDescriptor::IK_IntInduction: {
2499     assert(!isa<VectorType>(Index->getType()) &&
2500            "Vector indices not supported for integer inductions yet");
2501     assert(Index->getType() == StartValue->getType() &&
2502            "Index type does not match StartValue type");
2503     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2504       return B.CreateSub(StartValue, Index);
2505     auto *Offset = CreateMul(Index, Step);
2506     return CreateAdd(StartValue, Offset);
2507   }
2508   case InductionDescriptor::IK_PtrInduction: {
2509     assert(isa<Constant>(Step) &&
2510            "Expected constant step for pointer induction");
2511     return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2512   }
2513   case InductionDescriptor::IK_FpInduction: {
2514     assert(!isa<VectorType>(Index->getType()) &&
2515            "Vector indices not supported for FP inductions yet");
2516     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2517     auto InductionBinOp = ID.getInductionBinOp();
2518     assert(InductionBinOp &&
2519            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2520             InductionBinOp->getOpcode() == Instruction::FSub) &&
2521            "Original bin op should be defined for FP induction");
2522 
2523     Value *MulExp = B.CreateFMul(Step, Index);
2524     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2525                          "induction");
2526   }
2527   case InductionDescriptor::IK_NoInduction:
2528     return nullptr;
2529   }
2530   llvm_unreachable("invalid enum");
2531 }
2532 
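// Insert the scalar value generated for lane \p Instance.Lane of \p Def into
// the vector value of unroll part \p Instance.Part.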
2533 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2534                                                     const VPIteration &Instance,
2535                                                     VPTransformState &State) {
2536   Value *ScalarInst = State.get(Def, Instance);
2537   Value *VectorValue = State.get(Def, Instance.Part);
2538   VectorValue = Builder.CreateInsertElement(
2539       VectorValue, ScalarInst,
2540       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2541   State.set(Def, VectorValue, Instance.Part);
2542 }
2543 
2544 // Return whether we allow using masked interleave-groups (for dealing with
2545 // strided loads/stores that reside in predicated blocks, or for dealing
2546 // with gaps).
2547 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2548   // If an override option has been passed in for interleaved accesses, use it.
2549   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2550     return EnableMaskedInterleavedMemAccesses;
2551 
2552   return TTI.enableMaskedInterleavedAccessVectorization();
2553 }
2554 
2555 // Try to vectorize the interleave group that \p Instr belongs to.
2556 //
// E.g., translate the following interleaved load group (factor = 3):
2558 //   for (i = 0; i < N; i+=3) {
2559 //     R = Pic[i];             // Member of index 0
2560 //     G = Pic[i+1];           // Member of index 1
2561 //     B = Pic[i+2];           // Member of index 2
2562 //     ... // do something to R, G, B
2563 //   }
2564 // To:
2565 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2566 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2567 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2568 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2569 //
// Or translate the following interleaved store group (factor = 3):
2571 //   for (i = 0; i < N; i+=3) {
2572 //     ... do something to R, G, B
2573 //     Pic[i]   = R;           // Member of index 0
2574 //     Pic[i+1] = G;           // Member of index 1
2575 //     Pic[i+2] = B;           // Member of index 2
2576 //   }
2577 // To:
2578 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2579 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2580 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2581 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2582 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2583 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2584     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2585     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2586     VPValue *BlockInMask) {
2587   Instruction *Instr = Group->getInsertPos();
2588   const DataLayout &DL = Instr->getModule()->getDataLayout();
2589 
2590   // Prepare for the vector type of the interleaved load/store.
2591   Type *ScalarTy = getLoadStoreType(Instr);
2592   unsigned InterleaveFactor = Group->getFactor();
2593   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2594   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2595 
2596   // Prepare for the new pointers.
2597   SmallVector<Value *, 2> AddrParts;
2598   unsigned Index = Group->getIndex(Instr);
2599 
2600   // TODO: extend the masked interleaved-group support to reversed access.
2601   assert((!BlockInMask || !Group->isReverse()) &&
2602          "Reversed masked interleave-group not supported.");
2603 
2604   // If the group is reverse, adjust the index to refer to the last vector lane
2605   // instead of the first. We adjust the index from the first vector lane,
2606   // rather than directly getting the pointer for lane VF - 1, because the
2607   // pointer operand of the interleaved access is supposed to be uniform. For
2608   // uniform instructions, we're only required to generate a value for the
2609   // first vector lane in each unroll iteration.
2610   if (Group->isReverse())
2611     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
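  // Illustrative example: with VF = 4, an interleave factor of 2 and the
  // insert position at member index 1, a reversed group adjusts Index to
  // 1 + (4 - 1) * 2 = 7, and the pointer computed below is moved back by
  // Index elements.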
2612 
2613   for (unsigned Part = 0; Part < UF; Part++) {
2614     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2615     setDebugLocFromInst(AddrPart);
2616 
    // Note that the current instruction could have any member index; we need
    // to adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2628 
2629     bool InBounds = false;
2630     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2631       InBounds = gep->isInBounds();
2632     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2633     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2634 
2635     // Cast to the vector pointer type.
2636     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2637     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2638     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2639   }
2640 
2641   setDebugLocFromInst(Instr);
2642   Value *PoisonVec = PoisonValue::get(VecTy);
2643 
2644   Value *MaskForGaps = nullptr;
2645   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2646     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2647     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2648   }
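  // When a gap mask is needed, e.g. for a factor-3 group whose member at
  // index 2 is missing with VF = 4, it is <1,1,0, 1,1,0, 1,1,0, 1,1,0>
  // (illustrative).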
2649 
2650   // Vectorize the interleaved load group.
2651   if (isa<LoadInst>(Instr)) {
2652     // For each unroll part, create a wide load for the group.
2653     SmallVector<Value *, 2> NewLoads;
2654     for (unsigned Part = 0; Part < UF; Part++) {
2655       Instruction *NewLoad;
2656       if (BlockInMask || MaskForGaps) {
2657         assert(useMaskedInterleavedAccesses(*TTI) &&
2658                "masked interleaved groups are not allowed.");
2659         Value *GroupMask = MaskForGaps;
2660         if (BlockInMask) {
2661           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2662           Value *ShuffledMask = Builder.CreateShuffleVector(
2663               BlockInMaskPart,
2664               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2665               "interleaved.mask");
2666           GroupMask = MaskForGaps
2667                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2668                                                 MaskForGaps)
2669                           : ShuffledMask;
2670         }
2671         NewLoad =
2672             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2673                                      GroupMask, PoisonVec, "wide.masked.vec");
      } else
        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                            Group->getAlign(), "wide.vec");
2678       Group->addMetadata(NewLoad);
2679       NewLoads.push_back(NewLoad);
2680     }
2681 
2682     // For each member in the group, shuffle out the appropriate data from the
2683     // wide loads.
2684     unsigned J = 0;
2685     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2686       Instruction *Member = Group->getMember(I);
2687 
2688       // Skip the gaps in the group.
2689       if (!Member)
2690         continue;
2691 
2692       auto StrideMask =
2693           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2694       for (unsigned Part = 0; Part < UF; Part++) {
2695         Value *StridedVec = Builder.CreateShuffleVector(
2696             NewLoads[Part], StrideMask, "strided.vec");
2697 
        // If this member has a different type, cast the result to that type.
2699         if (Member->getType() != ScalarTy) {
2700           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2701           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2702           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2703         }
2704 
2705         if (Group->isReverse())
2706           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2707 
2708         State.set(VPDefs[J], StridedVec, Part);
2709       }
2710       ++J;
2711     }
2712     return;
2713   }
2714 
  // The sub vector type for the current instruction.
2716   auto *SubVT = VectorType::get(ScalarTy, VF);
2717 
2718   // Vectorize the interleaved store group.
2719   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2720   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2721          "masked interleaved groups are not allowed.");
2722   assert((!MaskForGaps || !VF.isScalable()) &&
2723          "masking gaps for scalable vectors is not yet supported.");
2724   for (unsigned Part = 0; Part < UF; Part++) {
2725     // Collect the stored vector from each member.
2726     SmallVector<Value *, 4> StoredVecs;
2727     for (unsigned i = 0; i < InterleaveFactor; i++) {
2728       assert((Group->getMember(i) || MaskForGaps) &&
2729              "Fail to get a member from an interleaved store group");
2730       Instruction *Member = Group->getMember(i);
2731 
2732       // Skip the gaps in the group.
2733       if (!Member) {
        Value *Poison = PoisonValue::get(SubVT);
        StoredVecs.push_back(Poison);
2736         continue;
2737       }
2738 
2739       Value *StoredVec = State.get(StoredValues[i], Part);
2740 
2741       if (Group->isReverse())
2742         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2743 
      // If this member has a different type, cast it to the unified type.
      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2748 
2749       StoredVecs.push_back(StoredVec);
2750     }
2751 
2752     // Concatenate all vectors into a wide vector.
2753     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2754 
2755     // Interleave the elements in the wide vector.
2756     Value *IVec = Builder.CreateShuffleVector(
2757         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2758         "interleaved.vec");
2759 
2760     Instruction *NewStoreInstr;
2761     if (BlockInMask || MaskForGaps) {
2762       Value *GroupMask = MaskForGaps;
2763       if (BlockInMask) {
2764         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2765         Value *ShuffledMask = Builder.CreateShuffleVector(
2766             BlockInMaskPart,
2767             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2768             "interleaved.mask");
2769         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2770                                                       ShuffledMask, MaskForGaps)
2771                                 : ShuffledMask;
2772       }
2773       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2774                                                 Group->getAlign(), GroupMask);
2775     } else
2776       NewStoreInstr =
2777           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2778 
2779     Group->addMetadata(NewStoreInstr);
2780   }
2781 }
2782 
2783 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2784                                                VPReplicateRecipe *RepRecipe,
2785                                                const VPIteration &Instance,
2786                                                bool IfPredicateInstr,
2787                                                VPTransformState &State) {
2788   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2789 
2790   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2791   // the first lane and part.
2792   if (isa<NoAliasScopeDeclInst>(Instr))
2793     if (!Instance.isFirstIteration())
2794       return;
2795 
  // Does this instruction return a value?
2797   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2798 
2799   Instruction *Cloned = Instr->clone();
2800   if (!IsVoidRetTy)
2801     Cloned->setName(Instr->getName() + ".cloned");
2802 
  // If the scalarized instruction contributes to the address computation of a
  // widened masked load/store which was in a basic block that needed
  // predication and is not predicated after vectorization, we can't propagate
  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
  // instruction could feed a poison value to the base address of the widened
  // load/store.
2809   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2810     Cloned->dropPoisonGeneratingFlags();
2811 
2812   if (Instr->getDebugLoc())
2813     setDebugLocFromInst(Instr);
2814 
2815   // Replace the operands of the cloned instructions with their scalar
2816   // equivalents in the new loop.
2817   for (auto &I : enumerate(RepRecipe->operands())) {
2818     auto InputInstance = Instance;
2819     VPValue *Operand = I.value();
2820     VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
2821     if (OperandR && OperandR->isUniform())
2822       InputInstance.Lane = VPLane::getFirstLane();
2823     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2824   }
2825   addNewMetadata(Cloned, Instr);
2826 
2827   // Place the cloned scalar in the new loop.
2828   State.Builder.Insert(Cloned);
2829 
2830   State.set(RepRecipe, Cloned, Instance);
2831 
  // If we just cloned a new assumption, add it to the assumption cache.
2833   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2834     AC->registerAssumption(II);
2835 
2836   // End if-block.
2837   if (IfPredicateInstr)
2838     PredicatedInstructions.push_back(Cloned);
2839 }
2840 
2841 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2842   if (TripCount)
2843     return TripCount;
2844 
2845   assert(InsertBlock);
2846   IRBuilder<> Builder(InsertBlock->getTerminator());
2847   // Find the loop boundaries.
2848   ScalarEvolution *SE = PSE.getSE();
2849   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2850   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2851          "Invalid loop count");
2852 
2853   Type *IdxTy = Legal->getWidestInductionType();
2854   assert(IdxTy && "No type for induction");
2855 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign extended before the
  // compare. The only way we get a backedge-taken count in that case is if the
  // induction variable was signed and as such will not overflow; in such a
  // case the truncation is legal.
2861   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2862       IdxTy->getPrimitiveSizeInBits())
2863     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2864   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2865 
2866   // Get the total trip count from the count by adding 1.
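  // (Illustrative: for a loop that runs exactly n >= 1 iterations, the
  // backedge-taken count is n - 1 and the trip count is n.)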
2867   const SCEV *ExitCount = SE->getAddExpr(
2868       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2869 
2870   const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2871 
2872   // Expand the trip count and place the new instructions in the preheader.
2873   // Notice that the pre-header does not change, only the loop body.
2874   SCEVExpander Exp(*SE, DL, "induction");
2875 
2876   // Count holds the overall loop count (N).
2877   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2878                                 InsertBlock->getTerminator());
2879 
2880   if (TripCount->getType()->isPointerTy())
2881     TripCount =
2882         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2883                                     InsertBlock->getTerminator());
2884 
2885   return TripCount;
2886 }
2887 
2888 Value *
2889 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2890   if (VectorTripCount)
2891     return VectorTripCount;
2892 
2893   Value *TC = getOrCreateTripCount(InsertBlock);
2894   IRBuilder<> Builder(InsertBlock->getTerminator());
2895 
2896   Type *Ty = TC->getType();
2897   // This is where we can make the step a runtime constant.
2898   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2899 
2900   // If the tail is to be folded by masking, round the number of iterations N
2901   // up to a multiple of Step instead of rounding down. This is done by first
2902   // adding Step-1 and then rounding down. Note that it's ok if this addition
2903   // overflows: the vector induction variable will eventually wrap to zero given
2904   // that it starts at zero and its Step is a power of two; the loop will then
2905   // exit, with the last early-exit vector comparison also producing all-true.
2906   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2907   // is accounted for in emitIterationCountCheck that adds an overflow check.
2908   if (Cost->foldTailByMasking()) {
2909     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2910            "VF*UF must be a power of 2 when folding tail by masking");
2911     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2912     TC = Builder.CreateAdd(
2913         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2914   }
2915 
2916   // Now we need to generate the expression for the part of the loop that the
2917   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2918   // iterations are not required for correctness, or N - Step, otherwise. Step
2919   // is equal to the vectorization factor (number of SIMD elements) times the
2920   // unroll factor (number of SIMD instructions).
2921   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2922 
2923   // There are cases where we *must* run at least one iteration in the remainder
2924   // loop.  See the cost model for when this can happen.  If the step evenly
2925   // divides the trip count, we set the remainder to be equal to the step. If
2926   // the step does not evenly divide the trip count, no adjustment is necessary
2927   // since there will already be scalar iterations. Note that the minimum
2928   // iterations check ensures that N >= Step.
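  // Illustrative example: with N = 21 and Step = VF * UF = 8, R = 5 and the
  // vector trip count is 16. If a scalar epilogue is required and N = 24, R is
  // bumped from 0 to 8, again giving a vector trip count of 16.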
2929   if (Cost->requiresScalarEpilogue(VF)) {
2930     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2931     R = Builder.CreateSelect(IsZero, Step, R);
2932   }
2933 
2934   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2935 
2936   return VectorTripCount;
2937 }
2938 
2939 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2940                                                    const DataLayout &DL) {
  // Verify that V is a vector with the same number of elements as DstVTy.
  auto *DstFVTy = cast<FixedVectorType>(DstVTy);
  unsigned VF = DstFVTy->getNumElements();
  auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2946   Type *SrcElemTy = SrcVecTy->getElementType();
2947   Type *DstElemTy = DstFVTy->getElementType();
2948   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2949          "Vector elements must have same size");
2950 
2951   // Do a direct cast if element types are castable.
  if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL))
    return Builder.CreateBitOrPointerCast(V, DstFVTy);
  // V cannot be directly cast to the desired vector type. This may happen
  // when V is a floating point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this using a two-step bitcast with an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
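  // Illustrative example: with 64-bit pointers, casting <4 x double> to
  // <4 x i8*> is done as <4 x double> -> <4 x i64> -> <4 x i8*>.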
2959   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2960          "Only one type should be a pointer type");
2961   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2962          "Only one type should be a floating point type");
2963   Type *IntTy =
2964       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2965   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2966   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2967   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2968 }
2969 
2970 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2971   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
2972   // Reuse existing vector loop preheader for TC checks.
2973   // Note that new preheader block is generated for vector loop.
2974   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2975   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2976 
2977   // Generate code to check if the loop's trip count is less than VF * UF, or
2978   // equal to it in case a scalar epilogue is required; this implies that the
2979   // vector trip count is zero. This check also covers the case where adding one
2980   // to the backedge-taken count overflowed leading to an incorrect trip count
2981   // of zero. In this case we will also jump to the scalar loop.
2982   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2983                                             : ICmpInst::ICMP_ULT;
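  // Illustrative example (when the tail is not folded by masking): with
  // VF = 4, UF = 2 and a required scalar epilogue, trip counts <= 8 branch
  // directly to the scalar loop.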
2984 
2985   // If tail is to be folded, vector loop takes care of all iterations.
2986   Type *CountTy = Count->getType();
2987   Value *CheckMinIters = Builder.getFalse();
2988   Value *Step = createStepForVF(Builder, CountTy, VF, UF);
2989   if (!Cost->foldTailByMasking())
2990     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2991   else if (VF.isScalable()) {
2992     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2993     // an overflow to zero when updating induction variables and so an
2994     // additional overflow check is required before entering the vector loop.
2995 
2996     // Get the maximum unsigned value for the type.
2997     Value *MaxUIntTripCount =
2998         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2999     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
3000 
3001     // Don't execute the vector loop if (UMax - n) < (VF * UF).
3002     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step);
3003   }
3004   // Create new preheader for vector loop.
3005   LoopVectorPreHeader =
3006       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3007                  "vector.ph");
3008 
3009   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3010                                DT->getNode(Bypass)->getIDom()) &&
3011          "TC check is expected to dominate Bypass");
3012 
3013   // Update dominator for Bypass & LoopExit (if needed).
3014   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3015   if (!Cost->requiresScalarEpilogue(VF))
3016     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3018     // dominator of the exit blocks.
3019     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3020 
3021   ReplaceInstWithInst(
3022       TCCheckBlock->getTerminator(),
3023       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3024   LoopBypassBlocks.push_back(TCCheckBlock);
3025 }
3026 
3027 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3028 
3029   BasicBlock *const SCEVCheckBlock =
3030       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3031   if (!SCEVCheckBlock)
3032     return nullptr;
3033 
3034   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3035            (OptForSizeBasedOnProfile &&
3036             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3037          "Cannot SCEV check stride or overflow when optimizing for size");
3038 
3040   // Update dominator only if this is first RT check.
3041   if (LoopBypassBlocks.empty()) {
3042     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3043     if (!Cost->requiresScalarEpilogue(VF))
3044       // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
3046       // dominator of the exit blocks.
3047       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3048   }
3049 
3050   LoopBypassBlocks.push_back(SCEVCheckBlock);
3051   AddedSafetyChecks = true;
3052   return SCEVCheckBlock;
3053 }
3054 
3055 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3056   // VPlan-native path does not do any analysis for runtime checks currently.
3057   if (EnableVPlanNativePath)
3058     return nullptr;
3059 
3060   BasicBlock *const MemCheckBlock =
3061       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3062 
3063   // Check if we generated code that checks in runtime if arrays overlap. We put
3064   // the checks into a separate block to make the more common case of few
3065   // elements faster.
3066   if (!MemCheckBlock)
3067     return nullptr;
3068 
3069   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3070     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3071            "Cannot emit memory checks when optimizing for size, unless forced "
3072            "to vectorize.");
3073     ORE->emit([&]() {
3074       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3075                                         OrigLoop->getStartLoc(),
3076                                         OrigLoop->getHeader())
3077              << "Code-size may be reduced by not forcing "
3078                 "vectorization, or by source-code modifications "
3079                 "eliminating the need for runtime checks "
3080                 "(e.g., adding 'restrict').";
3081     });
3082   }
3083 
3084   LoopBypassBlocks.push_back(MemCheckBlock);
3085 
3086   AddedSafetyChecks = true;
3087 
3088   // Only use noalias metadata when using memory checks guaranteeing no overlap
3089   // across all iterations.
3090   if (!Legal->getLAI()->getRuntimePointerChecking()->getDiffChecks()) {
3091     //  We currently don't use LoopVersioning for the actual loop cloning but we
3092     //  still use it to add the noalias metadata.
3093     LVer = std::make_unique<LoopVersioning>(
3094         *Legal->getLAI(),
3095         Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3096         DT, PSE.getSE());
3097     LVer->prepareNoAliasMetadata();
3098   }
3099   return MemCheckBlock;
3100 }
3101 
3102 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3103   LoopScalarBody = OrigLoop->getHeader();
3104   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3105   assert(LoopVectorPreHeader && "Invalid loop structure");
3106   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3107   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3108          "multiple exit loop without required epilogue?");
3109 
3110   LoopMiddleBlock =
3111       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3112                  LI, nullptr, Twine(Prefix) + "middle.block");
3113   LoopScalarPreHeader =
3114       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3115                  nullptr, Twine(Prefix) + "scalar.ph");
3116 
3117   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3118 
3119   // Set up the middle block terminator.  Two cases:
3120   // 1) If we know that we must execute the scalar epilogue, emit an
3121   //    unconditional branch.
3122   // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case).  In this case, set up a conditional
3124   //    branch from the middle block to the loop scalar preheader, and the
3125   //    exit block.  completeLoopSkeleton will update the condition to use an
3126   //    iteration check, if required to decide whether to execute the remainder.
3127   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3128     BranchInst::Create(LoopScalarPreHeader) :
3129     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3130                        Builder.getTrue());
3131   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3132   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3133 
3134   // Update dominator for loop exit. During skeleton creation, only the vector
3135   // pre-header and the middle block are created. The vector loop is entirely
  // created during VPlan execution.
3137   if (!Cost->requiresScalarEpilogue(VF))
3138     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3140     // dominator of the exit blocks.
3141     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3142 }
3143 
3144 void InnerLoopVectorizer::createInductionResumeValues(
3145     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3146   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3147           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3148          "Inconsistent information about additional bypass.");
3149 
3150   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3151   assert(VectorTripCount && "Expected valid arguments");
3152   // We are going to resume the execution of the scalar loop.
3153   // Go over all of the induction variables that we found and fix the
3154   // PHIs that are left in the scalar version of the loop.
3155   // The starting values of PHI nodes depend on the counter of the last
3156   // iteration in the vectorized loop.
3157   // If we come from a bypass edge then we need to start from the original
3158   // start value.
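  // Illustrative example: for a non-primary integer IV with start %start and
  // step %s, the resume value is %start + n.vec * %s when arriving from the
  // middle block, and %start when arriving from a bypass block.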
3159   Instruction *OldInduction = Legal->getPrimaryInduction();
3160   for (auto &InductionEntry : Legal->getInductionVars()) {
3161     PHINode *OrigPhi = InductionEntry.first;
3162     InductionDescriptor II = InductionEntry.second;
3163 
    // Create phi nodes to merge from the backedge-taken check block.
3165     PHINode *BCResumeVal =
3166         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3167                         LoopScalarPreHeader->getTerminator());
3168     // Copy original phi DL over to the new one.
3169     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3170     Value *&EndValue = IVEndValues[OrigPhi];
3171     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3172     if (OrigPhi == OldInduction) {
3173       // We know what the end value is.
3174       EndValue = VectorTripCount;
3175     } else {
3176       IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3177 
3178       // Fast-math-flags propagate from the original induction instruction.
3179       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3180         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3181 
3182       Type *StepType = II.getStep()->getType();
3183       Instruction::CastOps CastOp =
3184           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3185       Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc");
3186       Value *Step =
3187           CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3188       EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3189       EndValue->setName("ind.end");
3190 
3191       // Compute the end value for the additional bypass (if applicable).
3192       if (AdditionalBypass.first) {
3193         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3194         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3195                                          StepType, true);
3196         Value *Step =
3197             CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3198         VTC =
3199             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc");
3200         EndValueFromAdditionalBypass =
3201             emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3202         EndValueFromAdditionalBypass->setName("ind.end");
3203       }
3204     }
3205     // The new PHI merges the original incoming value, in case of a bypass,
3206     // or the value at the end of the vectorized loop.
3207     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3208 
3209     // Fix the scalar body counter (PHI node).
3210     // The old induction's phi node in the scalar body needs the truncated
3211     // value.
3212     for (BasicBlock *BB : LoopBypassBlocks)
3213       BCResumeVal->addIncoming(II.getStartValue(), BB);
3214 
3215     if (AdditionalBypass.first)
3216       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3217                                             EndValueFromAdditionalBypass);
3218 
3219     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3220   }
3221 }
3222 
3223 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) {
3224   // The trip counts should be cached by now.
3225   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3226   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3227 
3228   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3229 
3230   // Add a check in the middle block to see if we have completed
3231   // all of the iterations in the first vector loop.  Three cases:
3232   // 1) If we require a scalar epilogue, there is no conditional branch as
3233   //    we unconditionally branch to the scalar preheader.  Do nothing.
3234   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3235   //    Thus if tail is to be folded, we know we don't need to run the
3236   //    remainder and we can use the previous value for the condition (true).
3237   // 3) Otherwise, construct a runtime check.
3238   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3239     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3240                                         Count, VectorTripCount, "cmp.n",
3241                                         LoopMiddleBlock->getTerminator());
3242 
3243     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3244     // of the corresponding compare because they may have ended up with
3245     // different line numbers and we want to avoid awkward line stepping while
3246     // debugging. Eg. if the compare has got a line number inside the loop.
3247     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3248     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3249   }
3250 
3251 #ifdef EXPENSIVE_CHECKS
3252   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3253 #endif
3254 
3255   return LoopVectorPreHeader;
3256 }
3257 
3258 std::pair<BasicBlock *, Value *>
3259 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3260   /*
3261    In this function we generate a new loop. The new loop will contain
3262    the vectorized instructions while the old loop will continue to run the
3263    scalar remainder.
3264 
3265        [ ] <-- loop iteration number check.
3266     /   |
3267    /    v
3268   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3269   |  /  |
3270   | /   v
3271   ||   [ ]     <-- vector pre header.
3272   |/    |
3273   |     v
3274   |    [  ] \
3275   |    [  ]_|   <-- vector loop (created during VPlan execution).
3276   |     |
3277   |     v
3278   \   -[ ]   <--- middle-block.
3279    \/   |
3280    /\   v
3281    | ->[ ]     <--- new preheader.
3282    |    |
3283  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3284    |   [ ] \
3285    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3286     \   |
3287      \  v
3288       >[ ]     <-- exit block(s).
3289    ...
3290    */
3291 
3292   // Get the metadata of the original loop before it gets modified.
3293   MDNode *OrigLoopID = OrigLoop->getLoopID();
3294 
3295   // Workaround!  Compute the trip count of the original loop and cache it
3296   // before we start modifying the CFG.  This code has a systemic problem
3297   // wherein it tries to run analysis over partially constructed IR; this is
3298   // wrong, and not simply for SCEV.  The trip count of the original loop
3299   // simply happens to be prone to hitting this in practice.  In theory, we
3300   // can hit the same issue for any SCEV, or ValueTracking query done during
3301   // mutation.  See PR49900.
3302   getOrCreateTripCount(OrigLoop->getLoopPreheader());
3303 
3304   // Create an empty vector loop, and prepare basic blocks for the runtime
3305   // checks.
3306   createVectorLoopSkeleton("");
3307 
3308   // Now, compare the new count to zero. If it is zero skip the vector loop and
3309   // jump to the scalar loop. This check also covers the case where the
3310   // backedge-taken count is uint##_max: adding one to it will overflow leading
3311   // to an incorrect trip count of zero. In this (rare) case we will also jump
3312   // to the scalar loop.
3313   emitIterationCountCheck(LoopScalarPreHeader);
3314 
3315   // Generate the code to check any assumptions that we've made for SCEV
3316   // expressions.
3317   emitSCEVChecks(LoopScalarPreHeader);
3318 
3319   // Generate the code that checks in runtime if arrays overlap. We put the
3320   // checks into a separate block to make the more common case of few elements
3321   // faster.
3322   emitMemRuntimeChecks(LoopScalarPreHeader);
3323 
3324   // Emit phis for the new starting index of the scalar loop.
3325   createInductionResumeValues();
3326 
3327   return {completeLoopSkeleton(OrigLoopID), nullptr};
3328 }
3329 
3330 // Fix up external users of the induction variable. At this point, we are
3331 // in LCSSA form, with all external PHIs that use the IV having one input value,
3332 // coming from the remainder loop. We need those PHIs to also have a correct
3333 // value for the IV when arriving directly from the middle block.
3334 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3335                                        const InductionDescriptor &II,
3336                                        Value *VectorTripCount, Value *EndValue,
3337                                        BasicBlock *MiddleBlock,
3338                                        BasicBlock *VectorHeader) {
3339   // There are two kinds of external IV usages - those that use the value
3340   // computed in the last iteration (the PHI) and those that use the penultimate
3341   // value (the value that feeds into the phi from the loop latch).
3342   // We allow both, but they, obviously, have different values.
3343 
3344   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3345 
3346   DenseMap<Value *, Value *> MissingVals;
3347 
3348   // An external user of the last iteration's value should see the value that
3349   // the remainder loop uses to initialize its own IV.
3350   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3351   for (User *U : PostInc->users()) {
3352     Instruction *UI = cast<Instruction>(U);
3353     if (!OrigLoop->contains(UI)) {
3354       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3355       MissingVals[UI] = EndValue;
3356     }
3357   }
3358 
  // An external user of the penultimate value needs to see EndValue - Step.
3360   // The simplest way to get this is to recompute it from the constituent SCEVs,
3361   // that is Start + (Step * (CRD - 1)).
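  // Illustrative example: for an IV starting at 0 with step 1 and a vector
  // trip count of 16, a user of the pre-increment value outside the loop sees
  // 0 + 1 * (16 - 1) = 15.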
3362   for (User *U : OrigPhi->users()) {
3363     auto *UI = cast<Instruction>(U);
3364     if (!OrigLoop->contains(UI)) {
3365       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3366 
3367       IRBuilder<> B(MiddleBlock->getTerminator());
3368 
3369       // Fast-math-flags propagate from the original induction instruction.
3370       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3371         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3372 
3373       Value *CountMinusOne = B.CreateSub(
3374           VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3375       Value *CMO =
3376           !II.getStep()->getType()->isIntegerTy()
3377               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3378                              II.getStep()->getType())
3379               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3380       CMO->setName("cast.cmo");
3381 
3382       Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3383                                     VectorHeader->getTerminator());
3384       Value *Escape =
3385           emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
3386       Escape->setName("ind.escape");
3387       MissingVals[UI] = Escape;
3388     }
3389   }
3390 
3391   for (auto &I : MissingVals) {
3392     PHINode *PHI = cast<PHINode>(I.first);
3393     // One corner case we have to handle is two IVs "chasing" each-other,
3394     // that is %IV2 = phi [...], [ %IV1, %latch ]
3395     // In this case, if IV1 has an external use, we need to avoid adding both
3396     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3397     // don't already have an incoming value for the middle block.
3398     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3399       PHI->addIncoming(I.second, MiddleBlock);
3400   }
3401 }
3402 
3403 namespace {
3404 
3405 struct CSEDenseMapInfo {
3406   static bool canHandle(const Instruction *I) {
3407     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3408            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3409   }
3410 
3411   static inline Instruction *getEmptyKey() {
3412     return DenseMapInfo<Instruction *>::getEmptyKey();
3413   }
3414 
3415   static inline Instruction *getTombstoneKey() {
3416     return DenseMapInfo<Instruction *>::getTombstoneKey();
3417   }
3418 
3419   static unsigned getHashValue(const Instruction *I) {
3420     assert(canHandle(I) && "Unknown instruction!");
3421     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3422                                                            I->value_op_end()));
3423   }
3424 
3425   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3426     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3427         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3428       return LHS == RHS;
3429     return LHS->isIdenticalTo(RHS);
3430   }
3431 };
3432 
3433 } // end anonymous namespace
3434 
/// Perform CSE of induction variable instructions.
3436 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3438   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3439   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3440     if (!CSEDenseMapInfo::canHandle(&In))
3441       continue;
3442 
3443     // Check if we can replace this instruction with any of the
3444     // visited instructions.
3445     if (Instruction *V = CSEMap.lookup(&In)) {
3446       In.replaceAllUsesWith(V);
3447       In.eraseFromParent();
3448       continue;
3449     }
3450 
3451     CSEMap[&In] = &In;
3452   }
3453 }
3454 
3455 InstructionCost
3456 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3457                                               bool &NeedToScalarize) const {
3458   Function *F = CI->getCalledFunction();
3459   Type *ScalarRetTy = CI->getType();
3460   SmallVector<Type *, 4> Tys, ScalarTys;
3461   for (auto &ArgOp : CI->args())
3462     ScalarTys.push_back(ArgOp->getType());
3463 
3464   // Estimate cost of scalarized vector call. The source operands are assumed
3465   // to be vectors, so we need to extract individual elements from there,
3466   // execute VF scalar calls, and then gather the result into the vector return
3467   // value.
3468   InstructionCost ScalarCallCost =
3469       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3470   if (VF.isScalar())
3471     return ScalarCallCost;
3472 
3473   // Compute corresponding vector type for return value and arguments.
3474   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3475   for (Type *ScalarTy : ScalarTys)
3476     Tys.push_back(ToVectorTy(ScalarTy, VF));
3477 
3478   // Compute costs of unpacking argument values for the scalar calls and
3479   // packing the return values to a vector.
3480   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3481 
3482   InstructionCost Cost =
3483       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
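  // Illustrative example: with VF = 4, a scalar call cost of 10 and a
  // scalarization overhead of 6, the scalarized cost is 4 * 10 + 6 = 46.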
3484 
3485   // If we can't emit a vector call for this function, then the currently found
3486   // cost is the cost we need to return.
3487   NeedToScalarize = true;
3488   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3489   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3490 
3491   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3492     return Cost;
3493 
3494   // If the corresponding vector cost is cheaper, return its cost.
3495   InstructionCost VectorCallCost =
3496       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3497   if (VectorCallCost < Cost) {
3498     NeedToScalarize = false;
3499     Cost = VectorCallCost;
3500   }
3501   return Cost;
3502 }
3503 
3504 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3505   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3506     return Elt;
3507   return VectorType::get(Elt, VF);
3508 }
3509 
3510 InstructionCost
3511 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3512                                                    ElementCount VF) const {
3513   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3514   assert(ID && "Expected intrinsic call!");
3515   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3516   FastMathFlags FMF;
3517   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3518     FMF = FPMO->getFastMathFlags();
3519 
3520   SmallVector<const Value *> Arguments(CI->args());
3521   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3522   SmallVector<Type *> ParamTys;
3523   std::transform(FTy->param_begin(), FTy->param_end(),
3524                  std::back_inserter(ParamTys),
3525                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3526 
3527   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3528                                     dyn_cast<IntrinsicInst>(CI));
3529   return TTI.getIntrinsicInstrCost(CostAttrs,
3530                                    TargetTransformInfo::TCK_RecipThroughput);
3531 }
3532 
3533 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3534   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3535   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3536   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3537 }
3538 
3539 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3540   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3541   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3542   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3543 }
3544 
3545 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3546   // For every instruction `I` in MinBWs, truncate the operands, create a
3547   // truncated version of `I` and reextend its result. InstCombine runs
3548   // later and will remove any ext/trunc pairs.
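  // Illustrative example: if an i32 add is known to need only 8 bits, its
  // operands are truncated to <VF x i8>, the add is redone in i8, and the
  // result is extended back to <VF x i32>.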
3549   SmallPtrSet<Value *, 4> Erased;
3550   for (const auto &KV : Cost->getMinimalBitwidths()) {
3551     // If the value wasn't vectorized, we must maintain the original scalar
3552     // type. The absence of the value from State indicates that it
3553     // wasn't vectorized.
3554     // FIXME: Should not rely on getVPValue at this point.
3555     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3556     if (!State.hasAnyVectorValue(Def))
3557       continue;
3558     for (unsigned Part = 0; Part < UF; ++Part) {
3559       Value *I = State.get(Def, Part);
3560       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3561         continue;
3562       Type *OriginalTy = I->getType();
3563       Type *ScalarTruncatedTy =
3564           IntegerType::get(OriginalTy->getContext(), KV.second);
3565       auto *TruncatedTy = VectorType::get(
3566           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3567       if (TruncatedTy == OriginalTy)
3568         continue;
3569 
3570       IRBuilder<> B(cast<Instruction>(I));
3571       auto ShrinkOperand = [&](Value *V) -> Value * {
3572         if (auto *ZI = dyn_cast<ZExtInst>(V))
3573           if (ZI->getSrcTy() == TruncatedTy)
3574             return ZI->getOperand(0);
3575         return B.CreateZExtOrTrunc(V, TruncatedTy);
3576       };
3577 
3578       // The actual instruction modification depends on the instruction type,
3579       // unfortunately.
3580       Value *NewI = nullptr;
3581       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3582         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3583                              ShrinkOperand(BO->getOperand(1)));
3584 
3585         // Any wrapping introduced by shrinking this operation shouldn't be
3586         // considered undefined behavior. So, we can't unconditionally copy
3587         // arithmetic wrapping flags to NewI.
3588         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3589       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3590         NewI =
3591             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3592                          ShrinkOperand(CI->getOperand(1)));
3593       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3594         NewI = B.CreateSelect(SI->getCondition(),
3595                               ShrinkOperand(SI->getTrueValue()),
3596                               ShrinkOperand(SI->getFalseValue()));
3597       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3598         switch (CI->getOpcode()) {
3599         default:
3600           llvm_unreachable("Unhandled cast!");
3601         case Instruction::Trunc:
3602           NewI = ShrinkOperand(CI->getOperand(0));
3603           break;
3604         case Instruction::SExt:
3605           NewI = B.CreateSExtOrTrunc(
3606               CI->getOperand(0),
3607               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3608           break;
3609         case Instruction::ZExt:
3610           NewI = B.CreateZExtOrTrunc(
3611               CI->getOperand(0),
3612               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3613           break;
3614         }
3615       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3616         auto Elements0 =
3617             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3618         auto *O0 = B.CreateZExtOrTrunc(
3619             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3620         auto Elements1 =
3621             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3622         auto *O1 = B.CreateZExtOrTrunc(
3623             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3624 
3625         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3626       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3627         // Don't do anything with the operands, just extend the result.
3628         continue;
3629       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3630         auto Elements =
3631             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3632         auto *O0 = B.CreateZExtOrTrunc(
3633             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3634         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3635         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3636       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3637         auto Elements =
3638             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3639         auto *O0 = B.CreateZExtOrTrunc(
3640             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3641         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3642       } else {
3643         // If we don't know what to do, be conservative and don't do anything.
3644         continue;
3645       }
3646 
3647       // Lastly, extend the result.
3648       NewI->takeName(cast<Instruction>(I));
3649       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3650       I->replaceAllUsesWith(Res);
3651       cast<Instruction>(I)->eraseFromParent();
3652       Erased.insert(I);
3653       State.reset(Def, Res, Part);
3654     }
3655   }
3656 
  // We'll have created a number of zexts that are now dead. Clean them up.
3658   for (const auto &KV : Cost->getMinimalBitwidths()) {
3659     // If the value wasn't vectorized, we must maintain the original scalar
3660     // type. The absence of the value from State indicates that it
3661     // wasn't vectorized.
3662     // FIXME: Should not rely on getVPValue at this point.
3663     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3664     if (!State.hasAnyVectorValue(Def))
3665       continue;
3666     for (unsigned Part = 0; Part < UF; ++Part) {
3667       Value *I = State.get(Def, Part);
3668       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3669       if (Inst && Inst->use_empty()) {
3670         Value *NewI = Inst->getOperand(0);
3671         Inst->eraseFromParent();
3672         State.reset(Def, NewI, Part);
3673       }
3674     }
3675   }
3676 }
3677 
3678 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3679                                             VPlan &Plan) {
3680   // Insert truncates and extends for any truncated instructions as hints to
3681   // InstCombine.
3682   if (VF.isVector())
3683     truncateToMinimalBitwidths(State);
3684 
3685   // Fix widened non-induction PHIs by setting up the PHI operands.
3686   if (OrigPHIsToFix.size()) {
3687     assert(EnableVPlanNativePath &&
3688            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3689     fixNonInductionPHIs(State);
3690   }
3691 
3692   // At this point every instruction in the original loop is widened to a
3693   // vector form. Now we need to fix the recurrences in the loop. These PHI
3694   // nodes are currently empty because we did not want to introduce cycles.
3695   // This is the second stage of vectorizing recurrences.
3696   fixCrossIterationPHIs(State);
3697 
3698   // Forget the original basic block.
3699   PSE.getSE()->forgetLoop(OrigLoop);
3700 
3701   VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitBasicBlock();
3702   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3703   // If we inserted an edge from the middle block to the unique exit block,
3704   // update uses outside the loop (phis) to account for the newly inserted
3705   // edge.
3706   if (!Cost->requiresScalarEpilogue(VF)) {
3707     // Fix-up external users of the induction variables.
3708     for (auto &Entry : Legal->getInductionVars())
3709       fixupIVUsers(Entry.first, Entry.second,
3710                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3711                    IVEndValues[Entry.first], LoopMiddleBlock,
3712                    VectorLoop->getHeader());
3713 
3714     fixLCSSAPHIs(State);
3715   }
3716 
3717   for (Instruction *PI : PredicatedInstructions)
3718     sinkScalarOperands(&*PI);
3719 
3720   // Remove redundant induction instructions.
3721   cse(VectorLoop->getHeader());
3722 
3723   // Set/update profile weights for the vector and remainder loops as original
3724   // loop iterations are now distributed among them. Note that original loop
3725   // represented by LoopScalarBody becomes remainder loop after vectorization.
3726   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
3736   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3737                                LI->getLoopFor(LoopScalarBody),
3738                                VF.getKnownMinValue() * UF);
3739 }
3740 
3741 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3742   // In order to support recurrences we need to be able to vectorize Phi nodes.
3743   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3744   // stage #2: We now need to fix the recurrences by adding incoming edges to
3745   // the currently empty PHI nodes. At this point every instruction in the
3746   // original loop is widened to a vector form so we can use them to construct
3747   // the incoming edges.
3748   VPBasicBlock *Header =
3749       State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3750   for (VPRecipeBase &R : Header->phis()) {
3751     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3752       fixReduction(ReductionPhi, State);
3753     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3754       fixFirstOrderRecurrence(FOR, State);
3755   }
3756 }
3757 
3758 void InnerLoopVectorizer::fixFirstOrderRecurrence(
3759     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3760   // This is the second phase of vectorizing first-order recurrences. An
3761   // overview of the transformation is described below. Suppose we have the
3762   // following loop.
3763   //
3764   //   for (int i = 0; i < n; ++i)
3765   //     b[i] = a[i] - a[i - 1];
3766   //
3767   // There is a first-order recurrence on "a". For this loop, the shorthand
3768   // scalar IR looks like:
3769   //
3770   //   scalar.ph:
3771   //     s_init = a[-1]
3772   //     br scalar.body
3773   //
3774   //   scalar.body:
3775   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3776   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3777   //     s2 = a[i]
3778   //     b[i] = s2 - s1
3779   //     br cond, scalar.body, ...
3780   //
  // In this example, s1 is a recurrence because its value depends on the
3782   // previous iteration. In the first phase of vectorization, we created a
3783   // vector phi v1 for s1. We now complete the vectorization and produce the
3784   // shorthand vector IR shown below (for VF = 4, UF = 1).
3785   //
3786   //   vector.ph:
3787   //     v_init = vector(..., ..., ..., a[-1])
3788   //     br vector.body
3789   //
3790   //   vector.body
3791   //     i = phi [0, vector.ph], [i+4, vector.body]
3792   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3793   //     v2 = a[i, i+1, i+2, i+3];
3794   //     v3 = vector(v1(3), v2(0, 1, 2))
3795   //     b[i, i+1, i+2, i+3] = v2 - v3
3796   //     br cond, vector.body, middle.block
3797   //
3798   //   middle.block:
3799   //     x = v2(3)
3800   //     br scalar.ph
3801   //
3802   //   scalar.ph:
3803   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3804   //     br scalar.body
3805   //
  // After the vector loop finishes executing, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3808 
3809   // Extract the last vector element in the middle block. This will be the
3810   // initial value for the recurrence when jumping to the scalar loop.
3811   VPValue *PreviousDef = PhiR->getBackedgeValue();
3812   Value *Incoming = State.get(PreviousDef, UF - 1);
3813   auto *ExtractForScalar = Incoming;
3814   auto *IdxTy = Builder.getInt32Ty();
3815   if (VF.isVector()) {
3816     auto *One = ConstantInt::get(IdxTy, 1);
3817     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3818     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3819     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3820     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3821                                                     "vector.recur.extract");
3822   }
  // Extract the second-to-last element in the middle block if the
  // phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
3828   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3829   if (VF.isVector()) {
3830     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3831     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3832     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3833         Incoming, Idx, "vector.recur.extract.for.phi");
3834   } else if (UF > 1)
    // When the loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
    // value of `Incoming`. This is analogous to the vectorized case above:
    // extracting the second-to-last element when VF > 1.
3839     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3840 
3841   // Fix the initial value of the original recurrence in the scalar loop.
3842   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3843   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3844   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3845   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3846   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3847     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3848     Start->addIncoming(Incoming, BB);
3849   }
3850 
3851   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3852   Phi->setName("scalar.recur");
3853 
3854   // Finally, fix users of the recurrence outside the loop. The users will need
3855   // either the last value of the scalar recurrence or the last value of the
3856   // vector recurrence we extracted in the middle block. Since the loop is in
3857   // LCSSA form, we just need to find all the phi nodes for the original scalar
3858   // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from middle to exit and
  // thus no phis which need to be updated.
3863   if (!Cost->requiresScalarEpilogue(VF))
3864     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3865       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
3866         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3867 }
3868 
3869 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3870                                        VPTransformState &State) {
3871   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
3873   assert(Legal->isReductionVariable(OrigPhi) &&
3874          "Unable to find the reduction variable");
3875   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3876 
3877   RecurKind RK = RdxDesc.getRecurrenceKind();
3878   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3879   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3880   setDebugLocFromInst(ReductionStartValue);
3881 
3882   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3883   // This is the vector-clone of the value that leaves the loop.
3884   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3885 
3886   // Wrap flags are in general invalid after vectorization, clear them.
3887   clearReductionWrapFlags(RdxDesc, State);
3888 
3889   // Before each round, move the insertion point right between
3890   // the PHIs and the values we are going to write.
3891   // This allows us to write both PHINodes and the extractelement
3892   // instructions.
3893   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3894 
3895   setDebugLocFromInst(LoopExitInst);
3896 
3897   Type *PhiTy = OrigPhi->getType();
3898 
3899   VPBasicBlock *LatchVPBB =
3900       PhiR->getParent()->getEnclosingLoopRegion()->getExitBasicBlock();
3901   BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3902   // If tail is folded by masking, the vector value to leave the loop should be
3903   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3904   // instead of the former. For an inloop reduction the reduction will already
3905   // be predicated, and does not need to be handled here.
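  //
  // For example (a sketch for an i32 add reduction with tail folding):
  //
  //   %rdx.sel = select <VF x i1> %active.lane.mask,
  //                     <VF x i32> %vec.add, <VF x i32> %vec.phi
  //
  // Here the select, not %vec.add, is the value that should feed the
  // reduction phi and the final reduction in the middle block.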
3906   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3907     for (unsigned Part = 0; Part < UF; ++Part) {
3908       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3909       Value *Sel = nullptr;
3910       for (User *U : VecLoopExitInst->users()) {
3911         if (isa<SelectInst>(U)) {
3912           assert(!Sel && "Reduction exit feeding two selects");
3913           Sel = U;
3914         } else
3915           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3916       }
3917       assert(Sel && "Reduction exit feeds no select");
3918       State.reset(LoopExitInstDef, Sel, Part);
3919 
3920       // If the target can create a predicated operator for the reduction at no
3921       // extra cost in the loop (for example a predicated vadd), it can be
3922       // cheaper for the select to remain in the loop than be sunk out of it,
3923       // and so use the select value for the phi instead of the old
3924       // LoopExitValue.
3925       if (PreferPredicatedReductionSelect ||
3926           TTI->preferPredicatedReductionSelect(
3927               RdxDesc.getOpcode(), PhiTy,
3928               TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi = cast<PHINode>(State.get(PhiR, Part));
3931         VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3932       }
3933     }
3934   }
3935 
3936   // If the vector reduction can be performed in a smaller type, we truncate
3937   // then extend the loop exit value to enable InstCombine to evaluate the
3938   // entire expression in the smaller type.
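  //
  // For example (a sketch): an i32 add reduction whose recurrence type was
  // narrowed to i8 is truncated to <VF x i8> and extended back to <VF x i32>
  // in the loop latch, so InstCombine can later perform the whole reduction
  // chain in i8.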
3939   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3940     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3941     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3942     Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3943     VectorParts RdxParts(UF);
3944     for (unsigned Part = 0; Part < UF; ++Part) {
3945       RdxParts[Part] = State.get(LoopExitInstDef, Part);
3946       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3947       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3948                                         : Builder.CreateZExt(Trunc, VecTy);
3949       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3950         if (U != Trunc) {
3951           U->replaceUsesOfWith(RdxParts[Part], Extnd);
3952           RdxParts[Part] = Extnd;
3953         }
3954     }
3955     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3956     for (unsigned Part = 0; Part < UF; ++Part) {
3957       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3958       State.reset(LoopExitInstDef, RdxParts[Part], Part);
3959     }
3960   }
3961 
3962   // Reduce all of the unrolled parts into a single vector.
3963   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3964   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3965 
3966   // The middle block terminator has already been assigned a DebugLoc here (the
3967   // OrigLoop's single latch terminator). We want the whole middle block to
3968   // appear to execute on this line because: (a) it is all compiler generated,
3969   // (b) these instructions are always executed after evaluating the latch
3970   // conditional branch, and (c) other passes may add new predecessors which
3971   // terminate on this line. This is the easiest way to ensure we don't
3972   // accidentally cause an extra step back into the loop while debugging.
3973   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3974   if (PhiR->isOrdered())
3975     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3976   else {
3977     // Floating-point operations should have some FMF to enable the reduction.
3978     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3979     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3980     for (unsigned Part = 1; Part < UF; ++Part) {
3981       Value *RdxPart = State.get(LoopExitInstDef, Part);
3982       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3983         ReducedPartRdx = Builder.CreateBinOp(
3984             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3985       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3986         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3987                                            ReducedPartRdx, RdxPart);
3988       else
3989         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3990     }
3991   }
3992 
3993   // Create the reduction after the loop. Note that inloop reductions create the
3994   // target reduction in the loop using a Reduction recipe.
3995   if (VF.isVector() && !PhiR->isInLoop()) {
3996     ReducedPartRdx =
3997         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3998     // If the reduction can be performed in a smaller type, we need to extend
3999     // the reduction to the wider type before we branch to the original loop.
4000     if (PhiTy != RdxDesc.getRecurrenceType())
4001       ReducedPartRdx = RdxDesc.isSigned()
4002                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4003                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4004   }
4005 
4006   PHINode *ResumePhi =
4007       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4008 
4009   // Create a phi node that merges control-flow from the backedge-taken check
4010   // block and the middle block.
4011   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4012                                         LoopScalarPreHeader->getTerminator());
4013 
4014   // If we are fixing reductions in the epilogue loop then we should already
4015   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4016   // we carry over the incoming values correctly.
4017   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4018     if (Incoming == LoopMiddleBlock)
4019       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4020     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4021       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4022                               Incoming);
4023     else
4024       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4025   }
4026 
4027   // Set the resume value for this reduction
4028   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4029 
4030   // If there were stores of the reduction value to a uniform memory address
4031   // inside the loop, create the final store here.
4032   if (StoreInst *SI = RdxDesc.IntermediateStore) {
4033     StoreInst *NewSI =
4034         Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4035     propagateMetadata(NewSI, SI);
4036 
4037     // If the reduction value is used in other places,
4038     // then let the code below create PHI's for that.
4039   }
4040 
4041   // Now, we need to fix the users of the reduction variable
4042   // inside and outside of the scalar remainder loop.
4043 
4044   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4045   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4047   if (!Cost->requiresScalarEpilogue(VF))
4048     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4049       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4050         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4051 
4052   // Fix the scalar loop reduction variable with the incoming reduction sum
4053   // from the vector body and from the backedge value.
4054   int IncomingEdgeBlockIdx =
4055       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4056   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4057   // Pick the other block.
4058   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4059   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4060   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4061 }
4062 
// For add and mul reductions, drop poison-generating (wrap) flags from the
// vectorized instructions in the reduction cycle, since reordering the
// operations across vector lanes may otherwise introduce spurious overflow.
void InnerLoopVectorizer::clearReductionWrapFlags(
    const RecurrenceDescriptor &RdxDesc, VPTransformState &State) {
4065   RecurKind RK = RdxDesc.getRecurrenceKind();
4066   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4067     return;
4068 
4069   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4070   assert(LoopExitInstr && "null loop exit instruction");
4071   SmallVector<Instruction *, 8> Worklist;
4072   SmallPtrSet<Instruction *, 8> Visited;
4073   Worklist.push_back(LoopExitInstr);
4074   Visited.insert(LoopExitInstr);
4075 
4076   while (!Worklist.empty()) {
4077     Instruction *Cur = Worklist.pop_back_val();
4078     if (isa<OverflowingBinaryOperator>(Cur))
4079       for (unsigned Part = 0; Part < UF; ++Part) {
4080         // FIXME: Should not rely on getVPValue at this point.
4081         Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4082         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4083       }
4084 
4085     for (User *U : Cur->users()) {
4086       Instruction *UI = cast<Instruction>(U);
4087       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4088           Visited.insert(UI).second)
4089         Worklist.push_back(UI);
4090     }
4091   }
4092 }
4093 
4094 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4095   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4096     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4097       // Some phis were already hand updated by the reduction and recurrence
4098       // code above, leave them alone.
4099       continue;
4100 
4101     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4102     // Non-instruction incoming values will have only one value.
4103 
4104     VPLane Lane = VPLane::getFirstLane();
4105     if (isa<Instruction>(IncomingValue) &&
4106         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4107                                            VF))
4108       Lane = VPLane::getLastLaneForVF(VF);
4109 
4110     // Can be a loop invariant incoming value or the last scalar value to be
4111     // extracted from the vectorized loop.
4112     // FIXME: Should not rely on getVPValue at this point.
4113     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4114     Value *lastIncomingValue =
4115         OrigLoop->isLoopInvariant(IncomingValue)
4116             ? IncomingValue
4117             : State.get(State.Plan->getVPValue(IncomingValue, true),
4118                         VPIteration(UF - 1, Lane));
4119     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4120   }
4121 }
4122 
4123 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4124   // The basic block and loop containing the predicated instruction.
4125   auto *PredBB = PredInst->getParent();
4126   auto *VectorLoop = LI->getLoopFor(PredBB);
4127 
4128   // Initialize a worklist with the operands of the predicated instruction.
4129   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4130 
4131   // Holds instructions that we need to analyze again. An instruction may be
4132   // reanalyzed if we don't yet know if we can sink it or not.
4133   SmallVector<Instruction *, 8> InstsToReanalyze;
4134 
4135   // Returns true if a given use occurs in the predicated block. Phi nodes use
4136   // their operands in their corresponding predecessor blocks.
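  // For example (a sketch): for
  //   %r = phi [ %x, %pred.block ], [ %y, %other.block ]
  // the use of %x is attributed to %pred.block, so it counts as occurring in
  // the predicated block even though the phi itself lives outside it.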
4137   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4138     auto *I = cast<Instruction>(U.getUser());
4139     BasicBlock *BB = I->getParent();
4140     if (auto *Phi = dyn_cast<PHINode>(I))
4141       BB = Phi->getIncomingBlock(
4142           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4143     return BB == PredBB;
4144   };
4145 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
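  //
  // For example (a sketch): if a predicated store uses an address
  //   %gep = getelementptr inbounds i32, ptr %A, i64 %idx
  // and %gep has no other users outside the predicated block, %gep is moved
  // into that block so it only executes when the predicate holds.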
4150   bool Changed;
4151   do {
4152     // Add the instructions that need to be reanalyzed to the worklist, and
4153     // reset the changed indicator.
4154     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4155     InstsToReanalyze.clear();
4156     Changed = false;
4157 
4158     while (!Worklist.empty()) {
4159       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4160 
4161       // We can't sink an instruction if it is a phi node, is not in the loop,
4162       // or may have side effects.
4163       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4164           I->mayHaveSideEffects())
4165         continue;
4166 
4167       // If the instruction is already in PredBB, check if we can sink its
4168       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4169       // sinking the scalar instruction I, hence it appears in PredBB; but it
4170       // may have failed to sink I's operands (recursively), which we try
4171       // (again) here.
4172       if (I->getParent() == PredBB) {
4173         Worklist.insert(I->op_begin(), I->op_end());
4174         continue;
4175       }
4176 
4177       // It's legal to sink the instruction if all its uses occur in the
4178       // predicated block. Otherwise, there's nothing to do yet, and we may
4179       // need to reanalyze the instruction.
4180       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4181         InstsToReanalyze.push_back(I);
4182         continue;
4183       }
4184 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4187       I->moveBefore(&*PredBB->getFirstInsertionPt());
4188       Worklist.insert(I->op_begin(), I->op_end());
4189 
4190       // The sinking may have enabled other instructions to be sunk, so we will
4191       // need to iterate.
4192       Changed = true;
4193     }
4194   } while (Changed);
4195 }
4196 
4197 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4198   for (PHINode *OrigPhi : OrigPHIsToFix) {
4199     VPWidenPHIRecipe *VPPhi =
4200         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4201     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4202     // Make sure the builder has a valid insert point.
4203     Builder.SetInsertPoint(NewPhi);
4204     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4205       VPValue *Inc = VPPhi->getIncomingValue(i);
4206       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4207       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4208     }
4209   }
4210 }
4211 
4212 bool InnerLoopVectorizer::useOrderedReductions(
4213     const RecurrenceDescriptor &RdxDesc) {
4214   return Cost->useOrderedReductions(RdxDesc);
4215 }
4216 
4217 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4218                                               VPWidenPHIRecipe *PhiR,
4219                                               VPTransformState &State) {
4220   assert(EnableVPlanNativePath &&
4221          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
4222   // Currently we enter here in the VPlan-native path for non-induction
4223   // PHIs where all control flow is uniform. We simply widen these PHIs.
4224   // Create a vector phi with no operands - the vector phi operands will be
4225   // set at the end of vector code generation.
4226   Type *VecTy = (State.VF.isScalar())
4227                     ? PN->getType()
4228                     : VectorType::get(PN->getType(), State.VF);
4229   Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4230   State.set(PhiR, VecPhi, 0);
4231   OrigPHIsToFix.push_back(cast<PHINode>(PN));
4232 }
4233 
4234 /// A helper function for checking whether an integer division-related
4235 /// instruction may divide by zero (in which case it must be predicated if
4236 /// executed conditionally in the scalar code).
4237 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into a multiplication, so we will still end up scalarizing
/// the division, but can do so without predication.
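/// For example (a sketch): 'udiv i32 %x, 7' never needs predication, whereas
/// 'udiv i32 %x, %n' must be predicated when executed conditionally in the
/// scalar loop, since %n may be zero.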
4241 static bool mayDivideByZero(Instruction &I) {
4242   assert((I.getOpcode() == Instruction::UDiv ||
4243           I.getOpcode() == Instruction::SDiv ||
4244           I.getOpcode() == Instruction::URem ||
4245           I.getOpcode() == Instruction::SRem) &&
4246          "Unexpected instruction");
4247   Value *Divisor = I.getOperand(1);
4248   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4249   return !CInt || CInt->isZero();
4250 }
4251 
4252 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4253                                                VPUser &ArgOperands,
4254                                                VPTransformState &State) {
4255   assert(!isa<DbgInfoIntrinsic>(I) &&
4256          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4257   setDebugLocFromInst(&I);
4258 
4259   Module *M = I.getParent()->getParent()->getParent();
4260   auto *CI = cast<CallInst>(&I);
4261 
4262   SmallVector<Type *, 4> Tys;
4263   for (Value *ArgOperand : CI->args())
4264     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4265 
4266   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4267 
  // Decide whether to use an intrinsic call or a regular (library) call for
  // the vectorized version of the instruction, i.e. whether the intrinsic
  // call is at least as cheap as the vectorized library call.
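  // For example (a sketch): a call to llvm.sqrt.f32 may be widened to the
  // llvm.sqrt.v4f32 intrinsic, or to a vector library routine for sqrtf if
  // the target provides one, whichever the cost model considers cheaper.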
4271   bool NeedToScalarize = false;
4272   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4273   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4274   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4275   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4276          "Instruction should be scalarized elsewhere.");
4277   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4278          "Either the intrinsic cost or vector call cost must be valid");
4279 
4280   for (unsigned Part = 0; Part < UF; ++Part) {
4281     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4282     SmallVector<Value *, 4> Args;
4283     for (auto &I : enumerate(ArgOperands.operands())) {
4284       // Some intrinsics have a scalar argument - don't replace it with a
4285       // vector.
4286       Value *Arg;
4287       if (!UseVectorIntrinsic ||
4288           !isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))
4289         Arg = State.get(I.value(), Part);
4290       else
4291         Arg = State.get(I.value(), VPIteration(0, 0));
4292       if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))
4293         TysForDecl.push_back(Arg->getType());
4294       Args.push_back(Arg);
4295     }
4296 
4297     Function *VectorF;
4298     if (UseVectorIntrinsic) {
4299       // Use vector version of the intrinsic.
4300       if (VF.isVector())
4301         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4302       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4303       assert(VectorF && "Can't retrieve vector intrinsic.");
4304     } else {
4305       // Use vector version of the function call.
4306       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4307 #ifndef NDEBUG
4308       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4309              "Can't create vector function.");
4310 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    addMetadata(V, &I);
4322   }
4323 }
4324 
4325 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4326   // We should not collect Scalars more than once per VF. Right now, this
4327   // function is called from collectUniformsAndScalars(), which already does
4328   // this check. Collecting Scalars for VF=1 does not make any sense.
4329   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4330          "This function should not be visited twice for the same VF");
4331 
4332   // This avoids any chances of creating a REPLICATE recipe during planning
4333   // since that would result in generation of scalarized code during execution,
4334   // which is not supported for scalable vectors.
4335   if (VF.isScalable()) {
4336     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4337     return;
4338   }
4339 
4340   SmallSetVector<Instruction *, 8> Worklist;
4341 
4342   // These sets are used to seed the analysis with pointers used by memory
4343   // accesses that will remain scalar.
4344   SmallSetVector<Instruction *, 8> ScalarPtrs;
4345   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4346   auto *Latch = TheLoop->getLoopLatch();
4347 
4348   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4349   // The pointer operands of loads and stores will be scalar as long as the
4350   // memory access is not a gather or scatter operation. The value operand of a
4351   // store will remain scalar if the store is scalarized.
4352   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4353     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4354     assert(WideningDecision != CM_Unknown &&
4355            "Widening decision should be ready at this moment");
4356     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4357       if (Ptr == Store->getValueOperand())
4358         return WideningDecision == CM_Scalarize;
4359     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4360            "Ptr is neither a value or pointer operand");
4361     return WideningDecision != CM_GatherScatter;
4362   };
4363 
4364   // A helper that returns true if the given value is a bitcast or
4365   // getelementptr instruction contained in the loop.
4366   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4367     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4368             isa<GetElementPtrInst>(V)) &&
4369            !TheLoop->isLoopInvariant(V);
4370   };
4371 
4372   // A helper that evaluates a memory access's use of a pointer. If the use will
4373   // be a scalar use and the pointer is only used by memory accesses, we place
4374   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4375   // PossibleNonScalarPtrs.
4376   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4377     // We only care about bitcast and getelementptr instructions contained in
4378     // the loop.
4379     if (!isLoopVaryingBitCastOrGEP(Ptr))
4380       return;
4381 
4382     // If the pointer has already been identified as scalar (e.g., if it was
4383     // also identified as uniform), there's nothing to do.
4384     auto *I = cast<Instruction>(Ptr);
4385     if (Worklist.count(I))
4386       return;
4387 
4388     // If the use of the pointer will be a scalar use, and all users of the
4389     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4390     // place the pointer in PossibleNonScalarPtrs.
4391     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4392           return isa<LoadInst>(U) || isa<StoreInst>(U);
4393         }))
4394       ScalarPtrs.insert(I);
4395     else
4396       PossibleNonScalarPtrs.insert(I);
4397   };
4398 
  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast and
  // getelementptr instructions used by memory accesses requiring a scalar
  // use, and (3) instructions explicitly forced to be scalar for this VF.
4403   //
4404   // (1) Add to the worklist all instructions that have been identified as
4405   // uniform-after-vectorization.
4406   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4407 
4408   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4409   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4411   // scatter operation. The value operand of a store will remain scalar if the
4412   // store is scalarized.
4413   for (auto *BB : TheLoop->blocks())
4414     for (auto &I : *BB) {
4415       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4416         evaluatePtrUse(Load, Load->getPointerOperand());
4417       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4418         evaluatePtrUse(Store, Store->getPointerOperand());
4419         evaluatePtrUse(Store, Store->getValueOperand());
4420       }
4421     }
4422   for (auto *I : ScalarPtrs)
4423     if (!PossibleNonScalarPtrs.count(I)) {
4424       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4425       Worklist.insert(I);
4426     }
4427 
4428   // Insert the forced scalars.
4429   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4430   // induction variable when the PHI user is scalarized.
4431   auto ForcedScalar = ForcedScalars.find(VF);
4432   if (ForcedScalar != ForcedScalars.end())
4433     for (auto *I : ForcedScalar->second)
4434       Worklist.insert(I);
4435 
4436   // Expand the worklist by looking through any bitcasts and getelementptr
4437   // instructions we've already identified as scalar. This is similar to the
4438   // expansion step in collectLoopUniforms(); however, here we're only
4439   // expanding to include additional bitcasts and getelementptr instructions.
4440   unsigned Idx = 0;
4441   while (Idx != Worklist.size()) {
4442     Instruction *Dst = Worklist[Idx++];
4443     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4444       continue;
4445     auto *Src = cast<Instruction>(Dst->getOperand(0));
4446     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4447           auto *J = cast<Instruction>(U);
4448           return !TheLoop->contains(J) || Worklist.count(J) ||
4449                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4450                   isScalarUse(J, Src));
4451         })) {
4452       Worklist.insert(Src);
4453       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4454     }
4455   }
4456 
4457   // An induction variable will remain scalar if all users of the induction
4458   // variable and induction variable update remain scalar.
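  //
  // For example (a sketch):
  //
  //   %i      = phi i64 [ 0, %ph ], [ %i.next, %latch ]
  //   %i.next = add nuw nsw i64 %i, 1
  //
  // stay scalar when every user of %i and %i.next (address computations, the
  // latch compare, ...) is itself scalar after vectorization.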
4459   for (auto &Induction : Legal->getInductionVars()) {
4460     auto *Ind = Induction.first;
4461     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4462 
4463     // If tail-folding is applied, the primary induction variable will be used
4464     // to feed a vector compare.
4465     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4466       continue;
4467 
4468     // Returns true if \p Indvar is a pointer induction that is used directly by
4469     // load/store instruction \p I.
4470     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4471                                               Instruction *I) {
4472       return Induction.second.getKind() ==
4473                  InductionDescriptor::IK_PtrInduction &&
4474              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4475              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4476     };
4477 
4478     // Determine if all users of the induction variable are scalar after
4479     // vectorization.
4480     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4481       auto *I = cast<Instruction>(U);
4482       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4483              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4484     });
4485     if (!ScalarInd)
4486       continue;
4487 
4488     // Determine if all users of the induction variable update instruction are
4489     // scalar after vectorization.
4490     auto ScalarIndUpdate =
4491         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4492           auto *I = cast<Instruction>(U);
4493           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4494                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4495         });
4496     if (!ScalarIndUpdate)
4497       continue;
4498 
4499     // The induction variable and its update instruction will remain scalar.
4500     Worklist.insert(Ind);
4501     Worklist.insert(IndUpdate);
4502     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4503     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4504                       << "\n");
4505   }
4506 
4507   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4508 }
4509 
4510 bool LoopVectorizationCostModel::isScalarWithPredication(
4511     Instruction *I, ElementCount VF) const {
4512   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4513     return false;
  switch (I->getOpcode()) {
4515   default:
4516     break;
4517   case Instruction::Load:
4518   case Instruction::Store: {
4519     if (!Legal->isMaskRequired(I))
4520       return false;
4521     auto *Ptr = getLoadStorePointerOperand(I);
4522     auto *Ty = getLoadStoreType(I);
4523     Type *VTy = Ty;
4524     if (VF.isVector())
4525       VTy = VectorType::get(Ty, VF);
4526     const Align Alignment = getLoadStoreAlignment(I);
4527     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4528                                 TTI.isLegalMaskedGather(VTy, Alignment))
4529                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4530                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4531   }
4532   case Instruction::UDiv:
4533   case Instruction::SDiv:
4534   case Instruction::SRem:
4535   case Instruction::URem:
4536     return mayDivideByZero(*I);
4537   }
4538   return false;
4539 }
4540 
4541 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4542     Instruction *I, ElementCount VF) {
4543   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4544   assert(getWideningDecision(I, VF) == CM_Unknown &&
4545          "Decision should not be set yet.");
4546   auto *Group = getInterleavedAccessGroup(I);
4547   assert(Group && "Must have a group.");
4548 
  // If the instruction's allocated size doesn't equal its type size, it
4550   // requires padding and will be scalarized.
4551   auto &DL = I->getModule()->getDataLayout();
4552   auto *ScalarTy = getLoadStoreType(I);
4553   if (hasIrregularType(ScalarTy, DL))
4554     return false;
4555 
4556   // If the group involves a non-integral pointer, we may not be able to
4557   // losslessly cast all values to a common type.
4558   unsigned InterleaveFactor = Group->getFactor();
4559   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4560   for (unsigned i = 0; i < InterleaveFactor; i++) {
4561     Instruction *Member = Group->getMember(i);
4562     if (!Member)
4563       continue;
4564     auto *MemberTy = getLoadStoreType(Member);
4565     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4566     // Don't coerce non-integral pointers to integers or vice versa.
4567     if (MemberNI != ScalarNI) {
4568       // TODO: Consider adding special nullptr value case here
4569       return false;
4570     } else if (MemberNI && ScalarNI &&
4571                ScalarTy->getPointerAddressSpace() !=
4572                MemberTy->getPointerAddressSpace()) {
4573       return false;
4574     }
4575   }
4576 
4577   // Check if masking is required.
4578   // A Group may need masking for one of two reasons: it resides in a block that
4579   // needs predication, or it was decided to use masking to deal with gaps
4580   // (either a gap at the end of a load-access that may result in a speculative
4581   // load, or any gaps in a store-access).
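  //
  // For example (a sketch): a store-only group accessing A[3*i] and A[3*i+1]
  // with factor 3 leaves a gap at member 2 and therefore requires masking.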
4582   bool PredicatedAccessRequiresMasking =
4583       blockNeedsPredicationForAnyReason(I->getParent()) &&
4584       Legal->isMaskRequired(I);
4585   bool LoadAccessWithGapsRequiresEpilogMasking =
4586       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4587       !isScalarEpilogueAllowed();
4588   bool StoreAccessWithGapsRequiresMasking =
4589       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4590   if (!PredicatedAccessRequiresMasking &&
4591       !LoadAccessWithGapsRequiresEpilogMasking &&
4592       !StoreAccessWithGapsRequiresMasking)
4593     return true;
4594 
4595   // If masked interleaving is required, we expect that the user/target had
4596   // enabled it, because otherwise it either wouldn't have been created or
4597   // it should have been invalidated by the CostModel.
4598   assert(useMaskedInterleavedAccesses(TTI) &&
4599          "Masked interleave-groups for predicated accesses are not enabled.");
4600 
4601   if (Group->isReverse())
4602     return false;
4603 
4604   auto *Ty = getLoadStoreType(I);
4605   const Align Alignment = getLoadStoreAlignment(I);
4606   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4607                           : TTI.isLegalMaskedStore(Ty, Alignment);
4608 }
4609 
4610 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4611     Instruction *I, ElementCount VF) {
4612   // Get and ensure we have a valid memory instruction.
4613   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4614 
4615   auto *Ptr = getLoadStorePointerOperand(I);
4616   auto *ScalarTy = getLoadStoreType(I);
4617 
4618   // In order to be widened, the pointer should be consecutive, first of all.
4619   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4620     return false;
4621 
4622   // If the instruction is a store located in a predicated block, it will be
4623   // scalarized.
4624   if (isScalarWithPredication(I, VF))
4625     return false;
4626 
  // If the instruction's allocated size doesn't equal its type size, it
4628   // requires padding and will be scalarized.
4629   auto &DL = I->getModule()->getDataLayout();
4630   if (hasIrregularType(ScalarTy, DL))
4631     return false;
4632 
4633   return true;
4634 }
4635 
4636 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4637   // We should not collect Uniforms more than once per VF. Right now,
4638   // this function is called from collectUniformsAndScalars(), which
4639   // already does this check. Collecting Uniforms for VF=1 does not make any
4640   // sense.
4641 
4642   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4643          "This function should not be visited twice for the same VF");
4644 
  // Visit the list of Uniforms. Even if we do not find any uniform value, we
  // will not analyze this VF again: Uniforms.count(VF) will return 1.
4647   Uniforms[VF].clear();
4648 
4649   // We now know that the loop is vectorizable!
4650   // Collect instructions inside the loop that will remain uniform after
4651   // vectorization.
4652 
4653   // Global values, params and instructions outside of current loop are out of
4654   // scope.
4655   auto isOutOfScope = [&](Value *V) -> bool {
4656     Instruction *I = dyn_cast<Instruction>(V);
4657     return (!I || !TheLoop->contains(I));
4658   };
4659 
4660   // Worklist containing uniform instructions demanding lane 0.
4661   SetVector<Instruction *> Worklist;
4662   BasicBlock *Latch = TheLoop->getLoopLatch();
4663 
4664   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4665   // that are scalar with predication must not be considered uniform after
4666   // vectorization, because that would create an erroneous replicating region
4667   // where only a single instance out of VF should be formed.
4668   // TODO: optimize such seldom cases if found important, see PR40816.
4669   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4670     if (isOutOfScope(I)) {
4671       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4672                         << *I << "\n");
4673       return;
4674     }
4675     if (isScalarWithPredication(I, VF)) {
4676       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4677                         << *I << "\n");
4678       return;
4679     }
4680     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4681     Worklist.insert(I);
4682   };
4683 
4684   // Start with the conditional branch. If the branch condition is an
4685   // instruction contained in the loop that is only used by the branch, it is
4686   // uniform.
4687   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4688   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4689     addToWorklistIfAllowed(Cmp);
4690 
4691   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4692     InstWidening WideningDecision = getWideningDecision(I, VF);
4693     assert(WideningDecision != CM_Unknown &&
4694            "Widening decision should be ready at this moment");
4695 
4696     // A uniform memory op is itself uniform.  We exclude uniform stores
4697     // here as they demand the last lane, not the first one.
4698     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
4699       assert(WideningDecision == CM_Scalarize);
4700       return true;
4701     }
4702 
4703     return (WideningDecision == CM_Widen ||
4704             WideningDecision == CM_Widen_Reverse ||
4705             WideningDecision == CM_Interleave);
4706   };
4707 
4708 
4709   // Returns true if Ptr is the pointer operand of a memory access instruction
4710   // I, and I is known to not require scalarization.
4711   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4712     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4713   };
4714 
4715   // Holds a list of values which are known to have at least one uniform use.
4716   // Note that there may be other uses which aren't uniform.  A "uniform use"
4717   // here is something which only demands lane 0 of the unrolled iterations;
4718   // it does not imply that all lanes produce the same value (e.g. this is not
4719   // the usual meaning of uniform)
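  //
  // For example (a sketch): a getelementptr feeding a consecutive load that
  // will be widened is only demanded for lane 0; the wide load is built from
  // that single scalar address even though the other lanes of the
  // getelementptr would differ.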
4720   SetVector<Value *> HasUniformUse;
4721 
4722   // Scan the loop for instructions which are either a) known to have only
4723   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4724   for (auto *BB : TheLoop->blocks())
4725     for (auto &I : *BB) {
4726       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4727         switch (II->getIntrinsicID()) {
4728         case Intrinsic::sideeffect:
4729         case Intrinsic::experimental_noalias_scope_decl:
4730         case Intrinsic::assume:
4731         case Intrinsic::lifetime_start:
4732         case Intrinsic::lifetime_end:
4733           if (TheLoop->hasLoopInvariantOperands(&I))
4734             addToWorklistIfAllowed(&I);
4735           break;
4736         default:
4737           break;
4738         }
4739       }
4740 
4741       // ExtractValue instructions must be uniform, because the operands are
4742       // known to be loop-invariant.
4743       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4744         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4745                "Expected aggregate value to be loop invariant");
4746         addToWorklistIfAllowed(EVI);
4747         continue;
4748       }
4749 
4750       // If there's no pointer operand, there's nothing to do.
4751       auto *Ptr = getLoadStorePointerOperand(&I);
4752       if (!Ptr)
4753         continue;
4754 
4755       // A uniform memory op is itself uniform.  We exclude uniform stores
4756       // here as they demand the last lane, not the first one.
4757       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
4758         addToWorklistIfAllowed(&I);
4759 
4760       if (isUniformDecision(&I, VF)) {
4761         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4762         HasUniformUse.insert(Ptr);
4763       }
4764     }
4765 
4766   // Add to the worklist any operands which have *only* uniform (i.e. lane 0
4767   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4768   // disallows uses outside the loop as well.
4769   for (auto *V : HasUniformUse) {
4770     if (isOutOfScope(V))
4771       continue;
4772     auto *I = cast<Instruction>(V);
4773     auto UsersAreMemAccesses =
4774       llvm::all_of(I->users(), [&](User *U) -> bool {
4775         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4776       });
4777     if (UsersAreMemAccesses)
4778       addToWorklistIfAllowed(I);
4779   }
4780 
4781   // Expand Worklist in topological order: whenever a new instruction
4782   // is added, its users should already be inside the Worklist.  This ensures
4783   // that a uniform instruction will only be used by uniform instructions.
4784   unsigned idx = 0;
4785   while (idx != Worklist.size()) {
4786     Instruction *I = Worklist[idx++];
4787 
4788     for (auto OV : I->operand_values()) {
4789       // isOutOfScope operands cannot be uniform instructions.
4790       if (isOutOfScope(OV))
4791         continue;
4792       // First-order recurrence phis should typically be considered
4793       // non-uniform.
4794       auto *OP = dyn_cast<PHINode>(OV);
4795       if (OP && Legal->isFirstOrderRecurrence(OP))
4796         continue;
4797       // If all the users of the operand are uniform, then add the
4798       // operand into the uniform worklist.
4799       auto *OI = cast<Instruction>(OV);
4800       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4801             auto *J = cast<Instruction>(U);
4802             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4803           }))
4804         addToWorklistIfAllowed(OI);
4805     }
4806   }
4807 
4808   // For an instruction to be added into Worklist above, all its users inside
4809   // the loop should also be in Worklist. However, this condition cannot be
4810   // true for phi nodes that form a cyclic dependence. We must process phi
4811   // nodes separately. An induction variable will remain uniform if all users
4812   // of the induction variable and induction variable update remain uniform.
4813   // The code below handles both pointer and non-pointer induction variables.
4814   for (auto &Induction : Legal->getInductionVars()) {
4815     auto *Ind = Induction.first;
4816     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4817 
4818     // Determine if all users of the induction variable are uniform after
4819     // vectorization.
4820     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4821       auto *I = cast<Instruction>(U);
4822       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4823              isVectorizedMemAccessUse(I, Ind);
4824     });
4825     if (!UniformInd)
4826       continue;
4827 
4828     // Determine if all users of the induction variable update instruction are
4829     // uniform after vectorization.
4830     auto UniformIndUpdate =
4831         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4832           auto *I = cast<Instruction>(U);
4833           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4834                  isVectorizedMemAccessUse(I, IndUpdate);
4835         });
4836     if (!UniformIndUpdate)
4837       continue;
4838 
4839     // The induction variable and its update instruction will remain uniform.
4840     addToWorklistIfAllowed(Ind);
4841     addToWorklistIfAllowed(IndUpdate);
4842   }
4843 
4844   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4845 }
4846 
4847 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4848   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4849 
4850   if (Legal->getRuntimePointerChecking()->Need) {
4851     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4852         "runtime pointer checks needed. Enable vectorization of this "
4853         "loop with '#pragma clang loop vectorize(enable)' when "
4854         "compiling with -Os/-Oz",
4855         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4856     return true;
4857   }
4858 
4859   if (!PSE.getPredicate().isAlwaysTrue()) {
4860     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4861         "runtime SCEV checks needed. Enable vectorization of this "
4862         "loop with '#pragma clang loop vectorize(enable)' when "
4863         "compiling with -Os/-Oz",
4864         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4865     return true;
4866   }
4867 
4868   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4869   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4870     reportVectorizationFailure("Runtime stride check for small trip count",
4871         "runtime stride == 1 checks needed. Enable vectorization of "
4872         "this loop without such check by compiling with -Os/-Oz",
4873         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4874     return true;
4875   }
4876 
4877   return false;
4878 }
4879 
4880 ElementCount
4881 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4882   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4883     return ElementCount::getScalable(0);
4884 
4885   if (Hints->isScalableVectorizationDisabled()) {
4886     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4887                             "ScalableVectorizationDisabled", ORE, TheLoop);
4888     return ElementCount::getScalable(0);
4889   }
4890 
4891   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4892 
4893   auto MaxScalableVF = ElementCount::getScalable(
4894       std::numeric_limits<ElementCount::ScalarTy>::max());
4895 
4896   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4897   // FIXME: While for scalable vectors this is currently sufficient, this should
4898   // be replaced by a more detailed mechanism that filters out specific VFs,
4899   // instead of invalidating vectorization for a whole set of VFs based on the
4900   // MaxVF.
4901 
4902   // Disable scalable vectorization if the loop contains unsupported reductions.
4903   if (!canVectorizeReductions(MaxScalableVF)) {
4904     reportVectorizationInfo(
4905         "Scalable vectorization not supported for the reduction "
4906         "operations found in this loop.",
4907         "ScalableVFUnfeasible", ORE, TheLoop);
4908     return ElementCount::getScalable(0);
4909   }
4910 
4911   // Disable scalable vectorization if the loop contains any instructions
4912   // with element types not supported for scalable vectors.
4913   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4914         return !Ty->isVoidTy() &&
4915                !this->TTI.isElementTypeLegalForScalableVector(Ty);
4916       })) {
4917     reportVectorizationInfo("Scalable vectorization is not supported "
4918                             "for all element types found in this loop.",
4919                             "ScalableVFUnfeasible", ORE, TheLoop);
4920     return ElementCount::getScalable(0);
4921   }
4922 
4923   if (Legal->isSafeForAnyVectorWidth())
4924     return MaxScalableVF;
4925 
4926   // Limit MaxScalableVF by the maximum safe dependence distance.
4927   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
4928   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4929     MaxVScale =
4930         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4931   MaxScalableVF = ElementCount::getScalable(
4932       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
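       // For example, with MaxSafeElements = 32 and a maximum vscale of 4, the
       // largest legal scalable VF becomes vscale x 8; if the maximum vscale is
       // unknown, scalable vectorization is considered infeasible here.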
4933   if (!MaxScalableVF)
4934     reportVectorizationInfo(
4935         "Max legal vector width too small, scalable vectorization "
4936         "unfeasible.",
4937         "ScalableVFUnfeasible", ORE, TheLoop);
4938 
4939   return MaxScalableVF;
4940 }
4941 
4942 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4943     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4944   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4945   unsigned SmallestType, WidestType;
4946   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4947 
4948   // Get the maximum safe dependence distance in bits computed by LAA.
4949   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
4950   // the memory accesses that is most restrictive (involved in the smallest
4951   // dependence distance).
4952   unsigned MaxSafeElements =
4953       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
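       // For example, a maximum safe vector width of 256 bits with a widest type
       // of 32 bits gives MaxSafeElements = PowerOf2Floor(256 / 32) = 8.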
4954 
4955   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4956   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4957 
4958   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4959                     << ".\n");
4960   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4961                     << ".\n");
4962 
4963   // First analyze the UserVF, fall back if the UserVF should be ignored.
4964   if (UserVF) {
4965     auto MaxSafeUserVF =
4966         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4967 
4968     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4969       // If `VF=vscale x N` is safe, then so is `VF=N`
4970       if (UserVF.isScalable())
4971         return FixedScalableVFPair(
4972             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4973       else
4974         return UserVF;
4975     }
4976 
4977     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4978 
4979     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4980     // is better to ignore the hint and let the compiler choose a suitable VF.
4981     if (!UserVF.isScalable()) {
4982       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4983                         << " is unsafe, clamping to max safe VF="
4984                         << MaxSafeFixedVF << ".\n");
4985       ORE->emit([&]() {
4986         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4987                                           TheLoop->getStartLoc(),
4988                                           TheLoop->getHeader())
4989                << "User-specified vectorization factor "
4990                << ore::NV("UserVectorizationFactor", UserVF)
4991                << " is unsafe, clamping to maximum safe vectorization factor "
4992                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4993       });
4994       return MaxSafeFixedVF;
4995     }
4996 
4997     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4998       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4999                         << " is ignored because scalable vectors are not "
5000                            "available.\n");
5001       ORE->emit([&]() {
5002         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5003                                           TheLoop->getStartLoc(),
5004                                           TheLoop->getHeader())
5005                << "User-specified vectorization factor "
5006                << ore::NV("UserVectorizationFactor", UserVF)
5007                << " is ignored because the target does not support scalable "
5008                   "vectors. The compiler will pick a more suitable value.";
5009       });
5010     } else {
5011       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5012                         << " is unsafe. Ignoring scalable UserVF.\n");
5013       ORE->emit([&]() {
5014         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5015                                           TheLoop->getStartLoc(),
5016                                           TheLoop->getHeader())
5017                << "User-specified vectorization factor "
5018                << ore::NV("UserVectorizationFactor", UserVF)
5019                << " is unsafe. Ignoring the hint to let the compiler pick a "
5020                   "more suitable value.";
5021       });
5022     }
5023   }
5024 
5025   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5026                     << " / " << WidestType << " bits.\n");
5027 
5028   FixedScalableVFPair Result(ElementCount::getFixed(1),
5029                              ElementCount::getScalable(0));
5030   if (auto MaxVF =
5031           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5032                                   MaxSafeFixedVF, FoldTailByMasking))
5033     Result.FixedVF = MaxVF;
5034 
5035   if (auto MaxVF =
5036           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5037                                   MaxSafeScalableVF, FoldTailByMasking))
5038     if (MaxVF.isScalable()) {
5039       Result.ScalableVF = MaxVF;
5040       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5041                         << "\n");
5042     }
5043 
5044   return Result;
5045 }
5046 
5047 FixedScalableVFPair
5048 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5049   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
5050     // TODO: It may be useful to do since it's still likely to be dynamically
5051     // uniform if the target can skip.
5052     reportVectorizationFailure(
5053         "Not inserting runtime ptr check for divergent target",
5054         "runtime pointer checks needed. Not enabled for divergent target",
5055         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5056     return FixedScalableVFPair::getNone();
5057   }
5058 
5059   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5060   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5061   if (TC == 1) {
5062     reportVectorizationFailure("Single iteration (non) loop",
5063         "loop trip count is one, irrelevant for vectorization",
5064         "SingleIterationLoop", ORE, TheLoop);
5065     return FixedScalableVFPair::getNone();
5066   }
5067 
5068   switch (ScalarEpilogueStatus) {
5069   case CM_ScalarEpilogueAllowed:
5070     return computeFeasibleMaxVF(TC, UserVF, false);
5071   case CM_ScalarEpilogueNotAllowedUsePredicate:
5072     LLVM_FALLTHROUGH;
5073   case CM_ScalarEpilogueNotNeededUsePredicate:
5074     LLVM_DEBUG(
5075         dbgs() << "LV: vector predicate hint/switch found.\n"
5076                << "LV: Not allowing scalar epilogue, creating predicated "
5077                << "vector loop.\n");
5078     break;
5079   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5080     // fallthrough as a special case of OptForSize
5081   case CM_ScalarEpilogueNotAllowedOptSize:
5082     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5083       LLVM_DEBUG(
5084           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5085     else
5086       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5087                         << "count.\n");
5088 
5089     // Bail if runtime checks are required, which are not good when optimising
5090     // for size.
5091     if (runtimeChecksRequired())
5092       return FixedScalableVFPair::getNone();
5093 
5094     break;
5095   }
5096 
5097   // The only loops we can vectorize without a scalar epilogue are loops with
5098   // a bottom-test and a single exiting block. We'd have to handle the fact
5099   // that not every instruction executes on the last iteration.  This will
5100   // require a lane mask which varies through the vector loop body.  (TODO)
5101   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5102     // If there was a tail-folding hint/switch, but we can't fold the tail by
5103     // masking, fallback to a vectorization with a scalar epilogue.
5104     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5105       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5106                            "scalar epilogue instead.\n");
5107       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5108       return computeFeasibleMaxVF(TC, UserVF, false);
5109     }
5110     return FixedScalableVFPair::getNone();
5111   }
5112 
5113   // Now try folding the tail by masking.
5114 
5115   // Invalidate interleave groups that require an epilogue if we can't mask
5116   // the interleave-group.
5117   if (!useMaskedInterleavedAccesses(TTI)) {
5118     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5119            "No decisions should have been taken at this point");
5120     // Note: There is no need to invalidate any cost modeling decisions here, as
5121     // none were taken so far.
5122     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5123   }
5124 
5125   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5126   // Avoid tail folding if the trip count is known to be a multiple of any VF
5127   // we chose.
5128   // FIXME: The condition below pessimises the case for fixed-width vectors,
5129   // when scalable VFs are also candidates for vectorization.
5130   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5131     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5132     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5133            "MaxFixedVF must be a power of 2");
5134     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5135                                    : MaxFixedVF.getFixedValue();
5136     ScalarEvolution *SE = PSE.getSE();
5137     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5138     const SCEV *ExitCount = SE->getAddExpr(
5139         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5140     const SCEV *Rem = SE->getURemExpr(
5141         SE->applyLoopGuards(ExitCount, TheLoop),
5142         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
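         // For example, with a known trip count of 16, MaxFixedVF = 4 and
         // UserIC = 2, MaxVFtimesIC is 8, the remainder is 0 and no tail folding
         // is needed.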
5143     if (Rem->isZero()) {
5144       // Accept MaxFixedVF if we do not have a tail.
5145       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5146       return MaxFactors;
5147     }
5148   }
5149 
5150   // If we don't know the precise trip count, or if the trip count that we
5151   // found modulo the vectorization factor is not zero, try to fold the tail
5152   // by masking.
5153   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5154   if (Legal->prepareToFoldTailByMasking()) {
5155     FoldTailByMasking = true;
5156     return MaxFactors;
5157   }
5158 
5159   // If there was a tail-folding hint/switch, but we can't fold the tail by
5160   // masking, fallback to a vectorization with a scalar epilogue.
5161   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5162     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5163                          "scalar epilogue instead.\n");
5164     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5165     return MaxFactors;
5166   }
5167 
5168   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5169     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5170     return FixedScalableVFPair::getNone();
5171   }
5172 
5173   if (TC == 0) {
5174     reportVectorizationFailure(
5175         "Unable to calculate the loop count due to complex control flow",
5176         "unable to calculate the loop count due to complex control flow",
5177         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5178     return FixedScalableVFPair::getNone();
5179   }
5180 
5181   reportVectorizationFailure(
5182       "Cannot optimize for size and vectorize at the same time.",
5183       "cannot optimize for size and vectorize at the same time. "
5184       "Enable vectorization of this loop with '#pragma clang loop "
5185       "vectorize(enable)' when compiling with -Os/-Oz",
5186       "NoTailLoopWithOptForSize", ORE, TheLoop);
5187   return FixedScalableVFPair::getNone();
5188 }
5189 
5190 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5191     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5192     const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
5193   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5194   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5195       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5196                            : TargetTransformInfo::RGK_FixedWidthVector);
5197 
5198   // Convenience function to return the minimum of two ElementCounts.
5199   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5200     assert((LHS.isScalable() == RHS.isScalable()) &&
5201            "Scalable flags must match");
5202     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5203   };
5204 
5205   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
5206   // Note that both WidestRegister and WidestType may not be powers of 2.
5207   auto MaxVectorElementCount = ElementCount::get(
5208       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5209       ComputeScalableMaxVF);
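       // For example, a 128-bit vector register with a widest element type of 32
       // bits gives a MaxVectorElementCount of 4 (or vscale x 4 when computing the
       // scalable maximum VF).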
5210   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5211   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5212                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5213 
5214   if (!MaxVectorElementCount) {
5215     LLVM_DEBUG(dbgs() << "LV: The target has no "
5216                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5217                       << " vector registers.\n");
5218     return ElementCount::getFixed(1);
5219   }
5220 
5221   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5222   if (ConstTripCount &&
5223       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5224       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5225     // If loop trip count (TC) is known at compile time there is no point in
5226     // choosing VF greater than TC (as done in the loop below). Select maximum
5227     // power of two which doesn't exceed TC.
5228     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5229     // when the TC is less than or equal to the known number of lanes.
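         // For example, a constant trip count of 10 with MaxVectorElementCount = 16
         // clamps the VF to the fixed value 8.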
5230     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5231     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5232                          "exceeding the constant trip count: "
5233                       << ClampedConstTripCount << "\n");
5234     return ElementCount::getFixed(ClampedConstTripCount);
5235   }
5236 
5237   ElementCount MaxVF = MaxVectorElementCount;
5238   if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5239                             TTI.shouldMaximizeVectorBandwidth())) {
5240     auto MaxVectorElementCountMaxBW = ElementCount::get(
5241         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5242         ComputeScalableMaxVF);
5243     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5244 
5245     // Collect all viable vectorization factors larger than the default MaxVF
5246     // (i.e. MaxVectorElementCount).
5247     SmallVector<ElementCount, 8> VFs;
5248     for (ElementCount VS = MaxVectorElementCount * 2;
5249          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5250       VFs.push_back(VS);
5251 
5252     // For each VF calculate its register usage.
5253     auto RUs = calculateRegisterUsage(VFs);
5254 
5255     // Select the largest VF which doesn't require more registers than existing
5256     // ones.
5257     for (int i = RUs.size() - 1; i >= 0; --i) {
5258       bool Selected = true;
5259       for (auto &pair : RUs[i].MaxLocalUsers) {
5260         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5261         if (pair.second > TargetNumRegisters)
5262           Selected = false;
5263       }
5264       if (Selected) {
5265         MaxVF = VFs[i];
5266         break;
5267       }
5268     }
5269     if (ElementCount MinVF =
5270             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5271       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5272         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5273                           << ") with target's minimum: " << MinVF << '\n');
5274         MaxVF = MinVF;
5275       }
5276     }
5277 
5278     // Invalidate any widening decisions we might have made, in case the loop
5279     // requires predication (decided later), but we have already made some
5280     // load/store widening decisions.
5281     invalidateCostModelingDecisions();
5282   }
5283   return MaxVF;
5284 }
5285 
5286 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5287   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5288     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5289     auto Min = Attr.getVScaleRangeMin();
5290     auto Max = Attr.getVScaleRangeMax();
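         // For example, a vscale_range(2, 2) attribute pins vscale to 2, which is
         // then used as the tuning value.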
5291     if (Max && Min == Max)
5292       return Max;
5293   }
5294 
5295   return TTI.getVScaleForTuning();
5296 }
5297 
5298 bool LoopVectorizationCostModel::isMoreProfitable(
5299     const VectorizationFactor &A, const VectorizationFactor &B) const {
5300   InstructionCost CostA = A.Cost;
5301   InstructionCost CostB = B.Cost;
5302 
5303   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5304 
5305   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5306       MaxTripCount) {
5307     // If we are folding the tail and the trip count is a known (possibly small)
5308     // constant, the trip count will be rounded up to an integer number of
5309     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5310     // which we compare directly. When not folding the tail, the total cost will
5311     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5312     // approximated with the per-lane cost below instead of using the tripcount
5313     // as here.
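         // For example, with MaxTripCount = 10 and a per-iteration cost of 8 at
         // VF = 4, the total cost is 8 * ceil(10 / 4) = 24.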
5314     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5315     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5316     return RTCostA < RTCostB;
5317   }
5318 
5319   // Improve estimate for the vector width if it is scalable.
5320   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5321   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5322   if (Optional<unsigned> VScale = getVScaleForTuning()) {
5323     if (A.Width.isScalable())
5324       EstimatedWidthA *= VScale.getValue();
5325     if (B.Width.isScalable())
5326       EstimatedWidthB *= VScale.getValue();
5327   }
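       // For example, a width of vscale x 4 with a tuning vscale of 2 is treated
       // as 8 lanes in the comparison below.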
5328 
5329   // Assume vscale may be larger than 1 (or the value being tuned for),
5330   // so that scalable vectorization is slightly favorable over fixed-width
5331   // vectorization.
5332   if (A.Width.isScalable() && !B.Width.isScalable())
5333     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5334 
5335   // To avoid the need for FP division:
5336   //      (CostA / A.Width) < (CostB / B.Width)
5337   // <=>  (CostA * B.Width) < (CostB * A.Width)
5338   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5339 }
5340 
5341 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5342     const ElementCountSet &VFCandidates) {
5343   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5344   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5345   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5346   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5347          "Expected Scalar VF to be a candidate");
5348 
5349   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
5350   VectorizationFactor ChosenFactor = ScalarCost;
5351 
5352   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5353   if (ForceVectorization && VFCandidates.size() > 1) {
5354     // Ignore scalar width, because the user explicitly wants vectorization.
5355     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5356     // evaluation.
5357     ChosenFactor.Cost = InstructionCost::getMax();
5358   }
5359 
5360   SmallVector<InstructionVFPair> InvalidCosts;
5361   for (const auto &i : VFCandidates) {
5362     // The cost for scalar VF=1 is already calculated, so ignore it.
5363     if (i.isScalar())
5364       continue;
5365 
5366     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5367     VectorizationFactor Candidate(i, C.first);
5368 
5369 #ifndef NDEBUG
5370     unsigned AssumedMinimumVscale = 1;
5371     if (Optional<unsigned> VScale = getVScaleForTuning())
5372       AssumedMinimumVscale = VScale.getValue();
5373     unsigned Width =
5374         Candidate.Width.isScalable()
5375             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5376             : Candidate.Width.getFixedValue();
5377     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5378                       << " costs: " << (Candidate.Cost / Width));
5379     if (i.isScalable())
5380       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5381                         << AssumedMinimumVscale << ")");
5382     LLVM_DEBUG(dbgs() << ".\n");
5383 #endif
5384 
5385     if (!C.second && !ForceVectorization) {
5386       LLVM_DEBUG(
5387           dbgs() << "LV: Not considering vector loop of width " << i
5388                  << " because it will not generate any vector instructions.\n");
5389       continue;
5390     }
5391 
5392     // If profitable, add it to the ProfitableVFs list.
5393     if (isMoreProfitable(Candidate, ScalarCost))
5394       ProfitableVFs.push_back(Candidate);
5395 
5396     if (isMoreProfitable(Candidate, ChosenFactor))
5397       ChosenFactor = Candidate;
5398   }
5399 
5400   // Emit a report of VFs with invalid costs in the loop.
5401   if (!InvalidCosts.empty()) {
5402     // Group the remarks per instruction, keeping the instruction order from
5403     // InvalidCosts.
5404     std::map<Instruction *, unsigned> Numbering;
5405     unsigned I = 0;
5406     for (auto &Pair : InvalidCosts)
5407       if (!Numbering.count(Pair.first))
5408         Numbering[Pair.first] = I++;
5409 
5410     // Sort the list, first on instruction(number) then on VF.
5411     llvm::sort(InvalidCosts,
5412                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5413                  if (Numbering[A.first] != Numbering[B.first])
5414                    return Numbering[A.first] < Numbering[B.first];
5415                  ElementCountComparator ECC;
5416                  return ECC(A.second, B.second);
5417                });
5418 
5419     // For a list of ordered instruction-vf pairs:
5420     //   [(load, vf1), (load, vf2), (store, vf1)]
5421     // Group the instructions together to emit separate remarks for:
5422     //   load  (vf1, vf2)
5423     //   store (vf1)
5424     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5425     auto Subset = ArrayRef<InstructionVFPair>();
5426     do {
5427       if (Subset.empty())
5428         Subset = Tail.take_front(1);
5429 
5430       Instruction *I = Subset.front().first;
5431 
5432       // If the next instruction is different, or if there are no other pairs,
5433       // emit a remark for the collated subset. e.g.
5434       //   [(load, vf1), (load, vf2)]
5435       // to emit:
5436       //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5437       if (Subset == Tail || Tail[Subset.size()].first != I) {
5438         std::string OutString;
5439         raw_string_ostream OS(OutString);
5440         assert(!Subset.empty() && "Unexpected empty range");
5441         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5442         for (auto &Pair : Subset)
5443           OS << (Pair.second == Subset.front().second ? "" : ", ")
5444              << Pair.second;
5445         OS << "):";
5446         if (auto *CI = dyn_cast<CallInst>(I))
5447           OS << " call to " << CI->getCalledFunction()->getName();
5448         else
5449           OS << " " << I->getOpcodeName();
5450         OS.flush();
5451         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5452         Tail = Tail.drop_front(Subset.size());
5453         Subset = {};
5454       } else
5455         // Grow the subset by one element
5456         Subset = Tail.take_front(Subset.size() + 1);
5457     } while (!Tail.empty());
5458   }
5459 
5460   if (!EnableCondStoresVectorization && NumPredStores) {
5461     reportVectorizationFailure("There are conditional stores.",
5462         "store that is conditionally executed prevents vectorization",
5463         "ConditionalStore", ORE, TheLoop);
5464     ChosenFactor = ScalarCost;
5465   }
5466 
5467   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5468                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5469              << "LV: Vectorization seems to be not beneficial, "
5470              << "but was forced by a user.\n");
5471   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5472   return ChosenFactor;
5473 }
5474 
5475 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5476     const Loop &L, ElementCount VF) const {
5477   // Cross iteration phis such as reductions need special handling and are
5478   // currently unsupported.
5479   if (any_of(L.getHeader()->phis(),
5480              [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
5481     return false;
5482 
5483   // Phis with uses outside of the loop require special handling and are
5484   // currently unsupported.
5485   for (auto &Entry : Legal->getInductionVars()) {
5486     // Look for uses of the value of the induction at the last iteration.
5487     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5488     for (User *U : PostInc->users())
5489       if (!L.contains(cast<Instruction>(U)))
5490         return false;
5491     // Look for uses of penultimate value of the induction.
5492     for (User *U : Entry.first->users())
5493       if (!L.contains(cast<Instruction>(U)))
5494         return false;
5495   }
5496 
5497   // Induction variables that are widened require special handling that is
5498   // currently not supported.
5499   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5500         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5501                  this->isProfitableToScalarize(Entry.first, VF));
5502       }))
5503     return false;
5504 
5505   // Epilogue vectorization code has not been audited to ensure it handles
5506   // non-latch exits properly.  It may be fine, but it needs to be audited and
5507   // tested.
5508   if (L.getExitingBlock() != L.getLoopLatch())
5509     return false;
5510 
5511   return true;
5512 }
5513 
5514 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5515     const ElementCount VF) const {
5516   // FIXME: We need a much better cost-model to take different parameters such
5517   // as register pressure, code size increase and cost of extra branches into
5518   // account. For now we apply a very crude heuristic and only consider loops
5519   // with vectorization factors larger than a certain value.
5520   // We also consider epilogue vectorization unprofitable for targets that don't
5521   // consider interleaving beneficial (e.g. MVE).
5522   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5523     return false;
5524   // FIXME: We should consider changing the threshold for scalable
5525   // vectors to take VScaleForTuning into account.
5526   if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5527     return true;
5528   return false;
5529 }
5530 
5531 VectorizationFactor
5532 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5533     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5534   VectorizationFactor Result = VectorizationFactor::Disabled();
5535   if (!EnableEpilogueVectorization) {
5536     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5537     return Result;
5538   }
5539 
5540   if (!isScalarEpilogueAllowed()) {
5541     LLVM_DEBUG(
5542         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5543                   "allowed.\n";);
5544     return Result;
5545   }
5546 
5547   // Not really a cost consideration, but check for unsupported cases here to
5548   // simplify the logic.
5549   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5550     LLVM_DEBUG(
5551         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5552                   "not a supported candidate.\n";);
5553     return Result;
5554   }
5555 
5556   if (EpilogueVectorizationForceVF > 1) {
5557     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5558     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5559     if (LVP.hasPlanWithVF(ForcedEC))
5560       return {ForcedEC, 0};
5561     else {
5562       LLVM_DEBUG(
5563           dbgs()
5564               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5565       return Result;
5566     }
5567   }
5568 
5569   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5570       TheLoop->getHeader()->getParent()->hasMinSize()) {
5571     LLVM_DEBUG(
5572         dbgs()
5573             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5574     return Result;
5575   }
5576 
5577   if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5578     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5579                          "this loop\n");
5580     return Result;
5581   }
5582 
5583   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5584   // the main loop handles 8 lanes per iteration. We could still benefit from
5585   // vectorizing the epilogue loop with VF=4.
5586   ElementCount EstimatedRuntimeVF = MainLoopVF;
5587   if (MainLoopVF.isScalable()) {
5588     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5589     if (Optional<unsigned> VScale = getVScaleForTuning())
5590       EstimatedRuntimeVF *= VScale.getValue();
5591   }
5592 
5593   for (auto &NextVF : ProfitableVFs)
5594     if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5595           ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5596          ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5597         (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5598         LVP.hasPlanWithVF(NextVF.Width))
5599       Result = NextVF;
5600 
5601   if (Result != VectorizationFactor::Disabled())
5602     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5603                       << Result.Width << "\n";);
5604   return Result;
5605 }
5606 
5607 std::pair<unsigned, unsigned>
5608 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5609   unsigned MinWidth = -1U;
5610   unsigned MaxWidth = 8;
5611   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5612   // For in-loop reductions, no element types are added to ElementTypesInLoop
5613   // if there are no loads/stores in the loop. In this case, check through the
5614   // reduction variables to determine the maximum width.
5615   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5616     // Reset MaxWidth so that we can find the smallest type used by recurrences
5617     // in the loop.
5618     MaxWidth = -1U;
5619     for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
5620       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5621       // When finding the min width used by the recurrence we need to account
5622       // for casts on the input operands of the recurrence.
5623       MaxWidth = std::min<unsigned>(
5624           MaxWidth, std::min<unsigned>(
5625                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5626                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5627     }
5628   } else {
5629     for (Type *T : ElementTypesInLoop) {
5630       MinWidth = std::min<unsigned>(
5631           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5632       MaxWidth = std::max<unsigned>(
5633           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5634     }
5635   }
5636   return {MinWidth, MaxWidth};
5637 }
5638 
5639 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5640   ElementTypesInLoop.clear();
5641   // For each block.
5642   for (BasicBlock *BB : TheLoop->blocks()) {
5643     // For each instruction in the loop.
5644     for (Instruction &I : BB->instructionsWithoutDebug()) {
5645       Type *T = I.getType();
5646 
5647       // Skip ignored values.
5648       if (ValuesToIgnore.count(&I))
5649         continue;
5650 
5651       // Only examine Loads, Stores and PHINodes.
5652       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5653         continue;
5654 
5655       // Examine PHI nodes that are reduction variables. Update the type to
5656       // account for the recurrence type.
5657       if (auto *PN = dyn_cast<PHINode>(&I)) {
5658         if (!Legal->isReductionVariable(PN))
5659           continue;
5660         const RecurrenceDescriptor &RdxDesc =
5661             Legal->getReductionVars().find(PN)->second;
5662         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5663             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5664                                       RdxDesc.getRecurrenceType(),
5665                                       TargetTransformInfo::ReductionFlags()))
5666           continue;
5667         T = RdxDesc.getRecurrenceType();
5668       }
5669 
5670       // Examine the stored values.
5671       if (auto *ST = dyn_cast<StoreInst>(&I))
5672         T = ST->getValueOperand()->getType();
5673 
5674       assert(T->isSized() &&
5675              "Expected the load/store/recurrence type to be sized");
5676 
5677       ElementTypesInLoop.insert(T);
5678     }
5679   }
5680 }
5681 
5682 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5683                                                            unsigned LoopCost) {
5684   // -- The interleave heuristics --
5685   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5686   // There are many micro-architectural considerations that we can't predict
5687   // at this level. For example, frontend pressure (on decode or fetch) due to
5688   // code size, or the number and capabilities of the execution ports.
5689   //
5690   // We use the following heuristics to select the interleave count:
5691   // 1. If the code has reductions, then we interleave to break the cross
5692   // iteration dependency.
5693   // 2. If the loop is really small, then we interleave to reduce the loop
5694   // overhead.
5695   // 3. We don't interleave if we think that we will spill registers to memory
5696   // due to the increased register pressure.
5697 
5698   if (!isScalarEpilogueAllowed())
5699     return 1;
5700 
5701   // A finite maximum safe dependence distance already limits how many
       // elements may be processed per iteration, so do not interleave.
5702   if (Legal->getMaxSafeDepDistBytes() != -1U)
5703     return 1;
5704 
5705   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5706   const bool HasReductions = !Legal->getReductionVars().empty();
5707   // Do not interleave loops with a relatively small known or estimated trip
5708   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
5709   // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5710   // because under those conditions interleaving can expose ILP and break
5711   // cross-iteration dependences for reductions.
5712   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5713       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5714     return 1;
5715 
5716   // If we did not calculate the cost for VF (because the user selected the VF)
5717   // then we calculate the cost of VF here.
5718   if (LoopCost == 0) {
5719     InstructionCost C = expectedCost(VF).first;
5720     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
5721     LoopCost = *C.getValue();
5722 
5723     // Loop body is free and there is no need for interleaving.
5724     if (LoopCost == 0)
5725       return 1;
5726   }
5727 
5728   RegisterUsage R = calculateRegisterUsage({VF})[0];
5729   // We divide by these constants so assume that we have at least one
5730   // instruction that uses at least one register.
5731   for (auto& pair : R.MaxLocalUsers) {
5732     pair.second = std::max(pair.second, 1U);
5733   }
5734 
5735   // We calculate the interleave count using the following formula.
5736   // Subtract the number of loop invariants from the number of available
5737   // registers. These registers are used by all of the interleaved instances.
5738   // Next, divide the remaining registers by the number of registers that is
5739   // required by the loop, in order to estimate how many parallel instances
5740   // fit without causing spills. All of this is rounded down if necessary to be
5741   // a power of two. We want power of two interleave count to simplify any
5742   // addressing operations or alignment considerations.
5743   // We also want power of two interleave counts to ensure that the induction
5744   // variable of the vector loop wraps to zero, when tail is folded by masking;
5745   // this currently happens when OptForSize, in which case IC is set to 1 above.
5746   unsigned IC = UINT_MAX;
5747 
5748   for (auto& pair : R.MaxLocalUsers) {
5749     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5750     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5751                       << " registers of "
5752                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5753     if (VF.isScalar()) {
5754       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5755         TargetNumRegisters = ForceTargetNumScalarRegs;
5756     } else {
5757       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5758         TargetNumRegisters = ForceTargetNumVectorRegs;
5759     }
5760     unsigned MaxLocalUsers = pair.second;
5761     unsigned LoopInvariantRegs = 0;
5762     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5763       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5764 
5765     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5766     // Don't count the induction variable as interleaved.
5767     if (EnableIndVarRegisterHeur) {
5768       TmpIC =
5769           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5770                         std::max(1U, (MaxLocalUsers - 1)));
5771     }
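         // For example, with 32 target registers, 2 loop-invariant registers and 7
         // local users, the induction-variable heuristic gives
         // PowerOf2Floor((32 - 2 - 1) / 6) = 4.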
5772 
5773     IC = std::min(IC, TmpIC);
5774   }
5775 
5776   // Clamp the interleave ranges to reasonable counts.
5777   unsigned MaxInterleaveCount =
5778       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5779 
5780   // Check if the user has overridden the max.
5781   if (VF.isScalar()) {
5782     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5783       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5784   } else {
5785     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5786       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5787   }
5788 
5789   // If the trip count is a known or estimated compile-time constant, limit the
5790   // interleave count to at most the trip count divided by VF, ensuring it
5791   // is at least 1.
5792   //
5793   // For scalable vectors we can't know if interleaving is beneficial. It may
5794   // not be beneficial for small loops if none of the lanes in the second vector
5795   // iterations is enabled. However, for larger loops, there is likely to be a
5796   // similar benefit as for fixed-width vectors. For now, we choose to leave
5797   // the InterleaveCount as if vscale is '1', although if some information about
5798   // the vector is known (e.g. min vector size), we can make a better decision.
5799   if (BestKnownTC) {
5800     MaxInterleaveCount =
5801         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5802     // Make sure MaxInterleaveCount is greater than 0.
5803     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5804   }
5805 
5806   assert(MaxInterleaveCount > 0 &&
5807          "Maximum interleave count must be greater than 0");
5808 
5809   // Clamp the calculated IC to be between 1 and the max interleave count
5810   // that the target and trip count allow.
5811   if (IC > MaxInterleaveCount)
5812     IC = MaxInterleaveCount;
5813   else
5814     // Make sure IC is greater than 0.
5815     IC = std::max(1u, IC);
5816 
5817   assert(IC > 0 && "Interleave count must be greater than 0.");
5818 
5819   // Interleave if we vectorized this loop and there is a reduction that could
5820   // benefit from interleaving.
5821   if (VF.isVector() && HasReductions) {
5822     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5823     return IC;
5824   }
5825 
5826   // For any scalar loop that either requires runtime checks or predication we
5827   // are better off leaving this to the unroller. Note that if we've already
5828   // vectorized the loop we will have done the runtime check and so interleaving
5829   // won't require further checks.
5830   bool ScalarInterleavingRequiresPredication =
5831       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5832          return Legal->blockNeedsPredication(BB);
5833        }));
5834   bool ScalarInterleavingRequiresRuntimePointerCheck =
5835       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5836 
5837   // We want to interleave small loops in order to reduce the loop overhead and
5838   // potentially expose ILP opportunities.
5839   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5840                     << "LV: IC is " << IC << '\n'
5841                     << "LV: VF is " << VF << '\n');
5842   const bool AggressivelyInterleaveReductions =
5843       TTI.enableAggressiveInterleaving(HasReductions);
5844   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5845       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5846     // We assume that the cost overhead is 1 and we use the cost model
5847     // to estimate the cost of the loop and interleave until the cost of the
5848     // loop overhead is about 5% of the cost of the loop.
5849     unsigned SmallIC =
5850         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
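         // For example, assuming the default SmallLoopCost threshold of 20 and a
         // loop cost of 6, SmallIC = min(IC, PowerOf2Floor(20 / 6)) = min(IC, 2).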
5851 
5852     // Interleave until store/load ports (estimated by max interleave count) are
5853     // saturated.
5854     unsigned NumStores = Legal->getNumStores();
5855     unsigned NumLoads = Legal->getNumLoads();
5856     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5857     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
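         // For example, IC = 8 with 2 stores and 3 loads gives StoresIC = 4 and
         // LoadsIC = 2.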
5858 
5859     // There is little point in interleaving for reductions containing selects
5860     // and compares when VF=1 since it may just create more overhead than it's
5861     // worth for loops with small trip counts. This is because we still have to
5862     // do the final reduction after the loop.
5863     bool HasSelectCmpReductions =
5864         HasReductions &&
5865         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5866           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5867           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
5868               RdxDesc.getRecurrenceKind());
5869         });
5870     if (HasSelectCmpReductions) {
5871       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5872       return 1;
5873     }
5874 
5875     // If we have a scalar reduction (vector reductions are already dealt with
5876     // by this point), we can increase the critical path length if the loop
5877     // we're interleaving is inside another loop. For tree-wise reductions
5878     // set the limit to 2, and for ordered reductions it's best to disable
5879     // interleaving entirely.
5880     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5881       bool HasOrderedReductions =
5882           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5883             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5884             return RdxDesc.isOrdered();
5885           });
5886       if (HasOrderedReductions) {
5887         LLVM_DEBUG(
5888             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5889         return 1;
5890       }
5891 
5892       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5893       SmallIC = std::min(SmallIC, F);
5894       StoresIC = std::min(StoresIC, F);
5895       LoadsIC = std::min(LoadsIC, F);
5896     }
5897 
5898     if (EnableLoadStoreRuntimeInterleave &&
5899         std::max(StoresIC, LoadsIC) > SmallIC) {
5900       LLVM_DEBUG(
5901           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5902       return std::max(StoresIC, LoadsIC);
5903     }
5904 
5905     // If there are scalar reductions and TTI has enabled aggressive
5906     // interleaving for reductions, we will interleave to expose ILP.
5907     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5908         AggressivelyInterleaveReductions) {
5909       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5910       // Interleave no less than SmallIC but not as aggressive as the normal IC
5911       // to satisfy the rare situation when resources are too limited.
5912       return std::max(IC / 2, SmallIC);
5913     } else {
5914       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5915       return SmallIC;
5916     }
5917   }
5918 
5919   // Interleave if this is a large loop (small loops are already dealt with by
5920   // this point) that could benefit from interleaving.
5921   if (AggressivelyInterleaveReductions) {
5922     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5923     return IC;
5924   }
5925 
5926   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5927   return 1;
5928 }
5929 
5930 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5931 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5932   // This function calculates the register usage by measuring the highest number
5933   // of values that are alive at a single location. Obviously, this is a very
5934   // rough estimation. We scan the loop in topological order and
5935   // assign a number to each instruction. We use RPO to ensure that defs are
5936   // met before their users. We assume that each instruction that has in-loop
5937   // users starts an interval. We record every time that an in-loop value is
5938   // used, so we have a list of the first and last occurrences of each
5939   // instruction. Next, we transpose this data structure into a multi map that
5940   // holds the list of intervals that *end* at a specific location. This multi
5941   // map allows us to perform a linear search. We scan the instructions linearly
5942   // and record each time that a new interval starts, by placing it in a set.
5943   // If we find this value in the multi-map then we remove it from the set.
5944   // The max register usage is the maximum size of the set.
5945   // We also search for instructions that are defined outside the loop, but are
5946   // used inside the loop. We need this number separately from the max-interval
5947   // usage number because, when we unroll, loop-invariant values do not take
5948   // more registers.
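       // For example, in a body 'a = load p; b = a + 1; store b, q', the value
       // 'a' is live from the load to the add and 'b' from the add to the store,
       // while 'p' and 'q' (defined outside the loop) are counted as loop
       // invariants.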
5949   LoopBlocksDFS DFS(TheLoop);
5950   DFS.perform(LI);
5951 
5952   RegisterUsage RU;
5953 
5954   // Each 'key' in the map opens a new interval. The values
5955   // of the map are the index of the 'last seen' usage of the
5956   // instruction that is the key.
5957   using IntervalMap = DenseMap<Instruction *, unsigned>;
5958 
5959   // Maps instruction to its index.
5960   SmallVector<Instruction *, 64> IdxToInstr;
5961   // Marks the end of each interval.
5962   IntervalMap EndPoint;
  // Saves the set of instructions that are used in the loop.
5964   SmallPtrSet<Instruction *, 8> Ends;
5965   // Saves the list of values that are used in the loop but are
5966   // defined outside the loop, such as arguments and constants.
5967   SmallPtrSet<Value *, 8> LoopInvariants;
5968 
5969   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5970     for (Instruction &I : BB->instructionsWithoutDebug()) {
5971       IdxToInstr.push_back(&I);
5972 
5973       // Save the end location of each USE.
5974       for (Value *U : I.operands()) {
5975         auto *Instr = dyn_cast<Instruction>(U);
5976 
5977         // Ignore non-instruction values such as arguments, constants, etc.
5978         if (!Instr)
5979           continue;
5980 
5981         // If this instruction is outside the loop then record it and continue.
5982         if (!TheLoop->contains(Instr)) {
5983           LoopInvariants.insert(Instr);
5984           continue;
5985         }
5986 
5987         // Overwrite previous end points.
5988         EndPoint[Instr] = IdxToInstr.size();
5989         Ends.insert(Instr);
5990       }
5991     }
5992   }
5993 
5994   // Saves the list of intervals that end with the index in 'key'.
5995   using InstrList = SmallVector<Instruction *, 2>;
5996   DenseMap<unsigned, InstrList> TransposeEnds;
5997 
5998   // Transpose the EndPoints to a list of values that end at each index.
5999   for (auto &Interval : EndPoint)
6000     TransposeEnds[Interval.second].push_back(Interval.first);
6001 
6002   SmallPtrSet<Instruction *, 8> OpenIntervals;
6003   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6004   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6005 
6006   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6007 
6008   // A lambda that gets the register usage for the given type and VF.
6009   const auto &TTICapture = TTI;
6010   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6011     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6012       return 0;
6013     InstructionCost::CostType RegUsage =
6014         *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6015     assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
6016            "Nonsensical values for register usage.");
6017     return RegUsage;
6018   };
6019 
6020   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6021     Instruction *I = IdxToInstr[i];
6022 
6023     // Remove all of the instructions that end at this location.
6024     InstrList &List = TransposeEnds[i];
6025     for (Instruction *ToRemove : List)
6026       OpenIntervals.erase(ToRemove);
6027 
6028     // Ignore instructions that are never used within the loop.
6029     if (!Ends.count(I))
6030       continue;
6031 
6032     // Skip ignored values.
6033     if (ValuesToIgnore.count(I))
6034       continue;
6035 
6036     // For each VF find the maximum usage of registers.
6037     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6038       // Count the number of live intervals.
6039       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6040 
      if (VFs[j].isScalar()) {
        for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VFs[j]);
        for (auto Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }
6070 
      for (const auto &Pair : RegUsage)
        MaxUsages[j][Pair.first] =
            std::max(MaxUsages[j][Pair.first], Pair.second);
6077     }
6078 
6079     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6080                       << OpenIntervals.size() << '\n');
6081 
6082     // Add the current instruction to the list of open intervals.
6083     OpenIntervals.insert(I);
6084   }
6085 
6086   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6087     SmallMapVector<unsigned, unsigned, 4> Invariant;
6088 
6089     for (auto Inst : LoopInvariants) {
6090       unsigned Usage =
6091           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6092       unsigned ClassID =
6093           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6094       if (Invariant.find(ClassID) == Invariant.end())
6095         Invariant[ClassID] = Usage;
6096       else
6097         Invariant[ClassID] += Usage;
6098     }
6099 
6100     LLVM_DEBUG({
6101       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6102       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6103              << " item\n";
6104       for (const auto &pair : MaxUsages[i]) {
6105         dbgs() << "LV(REG): RegisterClass: "
6106                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6107                << " registers\n";
6108       }
6109       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6110              << " item\n";
6111       for (const auto &pair : Invariant) {
6112         dbgs() << "LV(REG): RegisterClass: "
6113                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6114                << " registers\n";
6115       }
6116     });
6117 
6118     RU.LoopInvariantRegs = Invariant;
6119     RU.MaxLocalUsers = MaxUsages[i];
6120     RUs[i] = RU;
6121   }
6122 
6123   return RUs;
6124 }
6125 
6126 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6127                                                            ElementCount VF) {
6128   // TODO: Cost model for emulated masked load/store is completely
6129   // broken. This hack guides the cost model to use an artificially
6130   // high enough value to practically disable vectorization with such
6131   // operations, except where previously deployed legality hack allowed
6132   // using very low cost values. This is to avoid regressions coming simply
6133   // from moving "masked load/store" check from legality to cost model.
  // Emulation of masked loads/gathers was previously never allowed.
  // Emulation of a limited number of masked stores/scatters was allowed.
6136   assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
6137   return isa<LoadInst>(I) ||
6138          (isa<StoreInst>(I) &&
6139           NumPredStores > NumberOfStoresToPredicate);
6140 }
6141 
6142 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6143   // If we aren't vectorizing the loop, or if we've already collected the
6144   // instructions to scalarize, there's nothing to do. Collection may already
6145   // have occurred if we have a user-selected VF and are now computing the
6146   // expected cost for interleaving.
6147   if (VF.isScalar() || VF.isZero() ||
6148       InstsToScalarize.find(VF) != InstsToScalarize.end())
6149     return;
6150 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6152   // not profitable to scalarize any instructions, the presence of VF in the
6153   // map will indicate that we've analyzed it already.
6154   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6155 
6156   // Find all the instructions that are scalar with predication in the loop and
6157   // determine if it would be better to not if-convert the blocks they are in.
6158   // If so, we also record the instructions to scalarize.
6159   for (BasicBlock *BB : TheLoop->blocks()) {
6160     if (!blockNeedsPredicationForAnyReason(BB))
6161       continue;
6162     for (Instruction &I : *BB)
6163       if (isScalarWithPredication(&I, VF)) {
6164         ScalarCostsTy ScalarCosts;
6165         // Do not apply discount if scalable, because that would lead to
6166         // invalid scalarization costs.
6167         // Do not apply discount logic if hacked cost is needed
6168         // for emulated masked memrefs.
6169         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6170             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6171           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6172         // Remember that BB will remain after vectorization.
6173         PredicatedBBsAfterVectorization.insert(BB);
6174       }
6175   }
6176 }
6177 
6178 int LoopVectorizationCostModel::computePredInstDiscount(
6179     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6180   assert(!isUniformAfterVectorization(PredInst, VF) &&
6181          "Instruction marked uniform-after-vectorization will be predicated");
6182 
6183   // Initialize the discount to zero, meaning that the scalar version and the
6184   // vector version cost the same.
6185   InstructionCost Discount = 0;
6186 
6187   // Holds instructions to analyze. The instructions we visit are mapped in
6188   // ScalarCosts. Those instructions are the ones that would be scalarized if
6189   // we find that the scalar version costs less.
6190   SmallVector<Instruction *, 8> Worklist;
6191 
6192   // Returns true if the given instruction can be scalarized.
6193   auto canBeScalarized = [&](Instruction *I) -> bool {
6194     // We only attempt to scalarize instructions forming a single-use chain
6195     // from the original predicated block that would otherwise be vectorized.
6196     // Although not strictly necessary, we give up on instructions we know will
6197     // already be scalar to avoid traversing chains that are unlikely to be
6198     // beneficial.
6199     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6200         isScalarAfterVectorization(I, VF))
6201       return false;
6202 
6203     // If the instruction is scalar with predication, it will be analyzed
6204     // separately. We ignore it within the context of PredInst.
6205     if (isScalarWithPredication(I, VF))
6206       return false;
6207 
6208     // If any of the instruction's operands are uniform after vectorization,
6209     // the instruction cannot be scalarized. This prevents, for example, a
6210     // masked load from being scalarized.
6211     //
6212     // We assume we will only emit a value for lane zero of an instruction
6213     // marked uniform after vectorization, rather than VF identical values.
6214     // Thus, if we scalarize an instruction that uses a uniform, we would
6215     // create uses of values corresponding to the lanes we aren't emitting code
6216     // for. This behavior can be changed by allowing getScalarValue to clone
6217     // the lane zero values for uniforms rather than asserting.
6218     for (Use &U : I->operands())
6219       if (auto *J = dyn_cast<Instruction>(U.get()))
6220         if (isUniformAfterVectorization(J, VF))
6221           return false;
6222 
6223     // Otherwise, we can scalarize the instruction.
6224     return true;
6225   };
6226 
6227   // Compute the expected cost discount from scalarizing the entire expression
6228   // feeding the predicated instruction. We currently only consider expressions
6229   // that are single-use instruction chains.
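  //
  // For example (illustrative only), if the predicated instruction is a
  // conditional store whose stored value is produced in the same block by a
  // chain such as
  //   %x = add i32 %a, %b
  //   %y = mul i32 %x, %c
  //   store i32 %y, i32* %p
  // and %x and %y have no users outside this chain, both are candidates for
  // being scalarized together with the store.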
6230   Worklist.push_back(PredInst);
6231   while (!Worklist.empty()) {
6232     Instruction *I = Worklist.pop_back_val();
6233 
6234     // If we've already analyzed the instruction, there's nothing to do.
6235     if (ScalarCosts.find(I) != ScalarCosts.end())
6236       continue;
6237 
6238     // Compute the cost of the vector instruction. Note that this cost already
6239     // includes the scalarization overhead of the predicated instruction.
6240     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6241 
6242     // Compute the cost of the scalarized instruction. This cost is the cost of
6243     // the instruction as if it wasn't if-converted and instead remained in the
6244     // predicated block. We will scale this cost by block probability after
6245     // computing the scalarization overhead.
6246     InstructionCost ScalarCost =
6247         VF.getFixedValue() *
6248         getInstructionCost(I, ElementCount::getFixed(1)).first;
6249 
6250     // Compute the scalarization overhead of needed insertelement instructions
6251     // and phi nodes.
6252     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6253       ScalarCost += TTI.getScalarizationOverhead(
6254           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6255           APInt::getAllOnes(VF.getFixedValue()), true, false);
6256       ScalarCost +=
6257           VF.getFixedValue() *
6258           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6259     }
6260 
6261     // Compute the scalarization overhead of needed extractelement
6262     // instructions. For each of the instruction's operands, if the operand can
6263     // be scalarized, add it to the worklist; otherwise, account for the
6264     // overhead.
6265     for (Use &U : I->operands())
6266       if (auto *J = dyn_cast<Instruction>(U.get())) {
6267         assert(VectorType::isValidElementType(J->getType()) &&
6268                "Instruction has non-scalar type");
6269         if (canBeScalarized(J))
6270           Worklist.push_back(J);
6271         else if (needsExtract(J, VF)) {
6272           ScalarCost += TTI.getScalarizationOverhead(
6273               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6274               APInt::getAllOnes(VF.getFixedValue()), false, true);
6275         }
6276       }
6277 
6278     // Scale the total scalar cost by block probability.
6279     ScalarCost /= getReciprocalPredBlockProb();
6280 
6281     // Compute the discount. A non-negative discount means the vector version
6282     // of the instruction costs more, and scalarizing would be beneficial.
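    //
    // For example (illustrative numbers): if the vector cost of I is 8 and
    // its probability-scaled scalar cost is 6, the discount grows by 2 in
    // favor of scalarization.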
6283     Discount += VectorCost - ScalarCost;
6284     ScalarCosts[I] = ScalarCost;
6285   }
6286 
6287   return *Discount.getValue();
6288 }
6289 
6290 LoopVectorizationCostModel::VectorizationCostTy
6291 LoopVectorizationCostModel::expectedCost(
6292     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6293   VectorizationCostTy Cost;
6294 
6295   // For each block.
6296   for (BasicBlock *BB : TheLoop->blocks()) {
6297     VectorizationCostTy BlockCost;
6298 
6299     // For each instruction in the old loop.
6300     for (Instruction &I : BB->instructionsWithoutDebug()) {
6301       // Skip ignored values.
6302       if (ValuesToIgnore.count(&I) ||
6303           (VF.isVector() && VecValuesToIgnore.count(&I)))
6304         continue;
6305 
6306       VectorizationCostTy C = getInstructionCost(&I, VF);
6307 
6308       // Check if we should override the cost.
6309       if (C.first.isValid() &&
6310           ForceTargetInstructionCost.getNumOccurrences() > 0)
6311         C.first = InstructionCost(ForceTargetInstructionCost);
6312 
6313       // Keep a list of instructions with invalid costs.
6314       if (Invalid && !C.first.isValid())
6315         Invalid->emplace_back(&I, VF);
6316 
6317       BlockCost.first += C.first;
6318       BlockCost.second |= C.second;
6319       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6320                         << " for VF " << VF << " For instruction: " << I
6321                         << '\n');
6322     }
6323 
6324     // If we are vectorizing a predicated block, it will have been
6325     // if-converted. This means that the block's instructions (aside from
6326     // stores and instructions that may divide by zero) will now be
6327     // unconditionally executed. For the scalar case, we may not always execute
6328     // the predicated block, if it is an if-else block. Thus, scale the block's
6329     // cost by the probability of executing it. blockNeedsPredication from
6330     // Legal is used so as to not include all blocks in tail folded loops.
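    // For example (illustrative, assuming the default reciprocal block
    // probability of 2): a predicated block's scalar cost is halved to model
    // that it only executes on roughly half of the iterations.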
6331     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6332       BlockCost.first /= getReciprocalPredBlockProb();
6333 
6334     Cost.first += BlockCost.first;
6335     Cost.second |= BlockCost.second;
6336   }
6337 
6338   return Cost;
6339 }
6340 
/// Gets the address access SCEV after verifying that the access pattern
/// is loop invariant except for the induction variable dependence.
6343 ///
6344 /// This SCEV can be sent to the Target in order to estimate the address
6345 /// calculation cost.
static const SCEV *getAddressAccessSCEV(Value *Ptr,
                                        LoopVectorizationLegality *Legal,
                                        PredicatedScalarEvolution &PSE,
                                        const Loop *TheLoop) {
6351 
6352   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6353   if (!Gep)
6354     return nullptr;
6355 
6356   // We are looking for a gep with all loop invariant indices except for one
6357   // which should be an induction variable.
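  // For example (illustrative only):
  //   %gep = getelementptr [64 x i32], [64 x i32]* %A, i64 %inv, i64 %iv
  // where %inv is loop invariant and %iv is an induction variable of TheLoop.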
6358   auto SE = PSE.getSE();
6359   unsigned NumOperands = Gep->getNumOperands();
6360   for (unsigned i = 1; i < NumOperands; ++i) {
6361     Value *Opd = Gep->getOperand(i);
6362     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6363         !Legal->isInductionVariable(Opd))
6364       return nullptr;
6365   }
6366 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6368   return PSE.getSCEV(Ptr);
6369 }
6370 
6371 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6372   return Legal->hasStride(I->getOperand(0)) ||
6373          Legal->hasStride(I->getOperand(1));
6374 }
6375 
6376 InstructionCost
6377 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6378                                                         ElementCount VF) {
6379   assert(VF.isVector() &&
6380          "Scalarization cost of instruction implies vectorization.");
6381   if (VF.isScalable())
6382     return InstructionCost::getInvalid();
6383 
6384   Type *ValTy = getLoadStoreType(I);
6385   auto SE = PSE.getSE();
6386 
6387   unsigned AS = getLoadStoreAddressSpace(I);
6388   Value *Ptr = getLoadStorePointerOperand(I);
6389   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6390   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6391   //       that it is being called from this specific place.
6392 
6393   // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6395   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6396 
6397   // Get the cost of the scalar memory instruction and address computation.
6398   InstructionCost Cost =
6399       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6400 
6401   // Don't pass *I here, since it is scalar but will actually be part of a
6402   // vectorized loop where the user of it is a vectorized instruction.
6403   const Align Alignment = getLoadStoreAlignment(I);
6404   Cost += VF.getKnownMinValue() *
6405           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6406                               AS, TTI::TCK_RecipThroughput);
6407 
6408   // Get the overhead of the extractelement and insertelement instructions
6409   // we might create due to scalarization.
6410   Cost += getScalarizationOverhead(I, VF);
6411 
6412   // If we have a predicated load/store, it will need extra i1 extracts and
6413   // conditional branches, but may not be executed for each vector lane. Scale
6414   // the cost by the probability of executing the predicated block.
6415   if (isPredicatedInst(I, VF)) {
6416     Cost /= getReciprocalPredBlockProb();
6417 
6418     // Add the cost of an i1 extract and a branch
6419     auto *Vec_i1Ty =
6420         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6421     Cost += TTI.getScalarizationOverhead(
6422         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6423         /*Insert=*/false, /*Extract=*/true);
6424     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6425 
6426     if (useEmulatedMaskMemRefHack(I, VF))
6427       // Artificially setting to a high enough value to practically disable
6428       // vectorization with such operations.
6429       Cost = 3000000;
6430   }
6431 
6432   return Cost;
6433 }
6434 
6435 InstructionCost
6436 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6437                                                     ElementCount VF) {
6438   Type *ValTy = getLoadStoreType(I);
6439   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6440   Value *Ptr = getLoadStorePointerOperand(I);
6441   unsigned AS = getLoadStoreAddressSpace(I);
6442   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6443   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6444 
6445   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6446          "Stride should be 1 or -1 for consecutive memory access");
6447   const Align Alignment = getLoadStoreAlignment(I);
6448   InstructionCost Cost = 0;
6449   if (Legal->isMaskRequired(I))
6450     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6451                                       CostKind);
6452   else
6453     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6454                                 CostKind, I);
6455 
6456   bool Reverse = ConsecutiveStride < 0;
6457   if (Reverse)
6458     Cost +=
6459         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6460   return Cost;
6461 }
6462 
6463 InstructionCost
6464 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6465                                                 ElementCount VF) {
6466   assert(Legal->isUniformMemOp(*I));
6467 
6468   Type *ValTy = getLoadStoreType(I);
6469   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6470   const Align Alignment = getLoadStoreAlignment(I);
6471   unsigned AS = getLoadStoreAddressSpace(I);
6472   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6473   if (isa<LoadInst>(I)) {
6474     return TTI.getAddressComputationCost(ValTy) +
6475            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6476                                CostKind) +
6477            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6478   }
6479   StoreInst *SI = cast<StoreInst>(I);
6480 
6481   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6482   return TTI.getAddressComputationCost(ValTy) +
6483          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6484                              CostKind) +
6485          (isLoopInvariantStoreValue
6486               ? 0
6487               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6488                                        VF.getKnownMinValue() - 1));
6489 }
6490 
6491 InstructionCost
6492 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6493                                                  ElementCount VF) {
6494   Type *ValTy = getLoadStoreType(I);
6495   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6496   const Align Alignment = getLoadStoreAlignment(I);
6497   const Value *Ptr = getLoadStorePointerOperand(I);
6498 
6499   return TTI.getAddressComputationCost(VectorTy) +
6500          TTI.getGatherScatterOpCost(
6501              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6502              TargetTransformInfo::TCK_RecipThroughput, I);
6503 }
6504 
6505 InstructionCost
6506 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6507                                                    ElementCount VF) {
6508   // TODO: Once we have support for interleaving with scalable vectors
6509   // we can calculate the cost properly here.
6510   if (VF.isScalable())
6511     return InstructionCost::getInvalid();
6512 
6513   Type *ValTy = getLoadStoreType(I);
6514   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6515   unsigned AS = getLoadStoreAddressSpace(I);
6516 
6517   auto Group = getInterleavedAccessGroup(I);
6518   assert(Group && "Fail to get an interleaved access group.");
6519 
6520   unsigned InterleaveFactor = Group->getFactor();
6521   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6522 
6523   // Holds the indices of existing members in the interleaved group.
6524   SmallVector<unsigned, 4> Indices;
6525   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6526     if (Group->getMember(IF))
6527       Indices.push_back(IF);
6528 
6529   // Calculate the cost of the whole interleaved group.
6530   bool UseMaskForGaps =
6531       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6532       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6533   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6534       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6535       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6536 
6537   if (Group->isReverse()) {
6538     // TODO: Add support for reversed masked interleaved access.
6539     assert(!Legal->isMaskRequired(I) &&
6540            "Reverse masked interleaved access not supported.");
6541     Cost +=
6542         Group->getNumMembers() *
6543         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6544   }
6545   return Cost;
6546 }
6547 
6548 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6549     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6550   using namespace llvm::PatternMatch;
6551   // Early exit for no inloop reductions
6552   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6553     return None;
6554   auto *VectorTy = cast<VectorType>(Ty);
6555 
  // We are looking for one of the following patterns, finding the minimal
  // acceptable cost for it:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree, finding the root reduction
  // instruction in InLoopReductionImmediateChains. From there we find the
  // pattern of mul/ext and test the cost of the entire pattern vs the cost of
  // the components. If the reduction cost is lower, we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return an invalid cost specifying that the original cost
  // method should be used.
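  //
  // For example (illustrative only), a dot-product style chain such as
  //   %xe = sext i8 %x to i32
  //   %ye = sext i8 %y to i32
  //   %m = mul i32 %xe, %ye
  //   %red.next = add i32 %red, %m
  // matches reduce(mul(ext(A), ext(B))) and may be costed as a single
  // extended multiply-add reduction when the target reports that as cheaper
  // than the sum of the component costs.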
6568   Instruction *RetI = I;
6569   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6570     if (!RetI->hasOneUser())
6571       return None;
6572     RetI = RetI->user_back();
6573   }
6574   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6575       RetI->user_back()->getOpcode() == Instruction::Add) {
6576     if (!RetI->hasOneUser())
6577       return None;
6578     RetI = RetI->user_back();
6579   }
6580 
6581   // Test if the found instruction is a reduction, and if not return an invalid
6582   // cost specifying the parent to use the original cost modelling.
6583   if (!InLoopReductionImmediateChains.count(RetI))
6584     return None;
6585 
6586   // Find the reduction this chain is a part of and calculate the basic cost of
6587   // the reduction on its own.
6588   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6589   Instruction *ReductionPhi = LastChain;
6590   while (!isa<PHINode>(ReductionPhi))
6591     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6592 
6593   const RecurrenceDescriptor &RdxDesc =
6594       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6595 
6596   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6597       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6598 
6599   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6600   // normal fmul instruction to the cost of the fadd reduction.
6601   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6602     BaseCost +=
6603         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6604 
6605   // If we're using ordered reductions then we can just return the base cost
6606   // here, since getArithmeticReductionCost calculates the full ordered
6607   // reduction cost when FP reassociation is not allowed.
6608   if (useOrderedReductions(RdxDesc))
6609     return BaseCost;
6610 
6611   // Get the operand that was not the reduction chain and match it to one of the
6612   // patterns, returning the better cost if it is found.
6613   Instruction *RedOp = RetI->getOperand(1) == LastChain
6614                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6615                            : dyn_cast<Instruction>(RetI->getOperand(1));
6616 
6617   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6618 
6619   Instruction *Op0, *Op1;
6620   if (RedOp &&
6621       match(RedOp,
6622             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6623       match(Op0, m_ZExtOrSExt(m_Value())) &&
6624       Op0->getOpcode() == Op1->getOpcode() &&
6625       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6626       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6627       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6628 
    // Matched reduce(ext(mul(ext(A), ext(B))))
6630     // Note that the extend opcodes need to all match, or if A==B they will have
6631     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6632     // which is equally fine.
6633     bool IsUnsigned = isa<ZExtInst>(Op0);
6634     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6635     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6636 
6637     InstructionCost ExtCost =
6638         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6639                              TTI::CastContextHint::None, CostKind, Op0);
6640     InstructionCost MulCost =
6641         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6642     InstructionCost Ext2Cost =
6643         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6644                              TTI::CastContextHint::None, CostKind, RedOp);
6645 
6646     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6647         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6648         CostKind);
6649 
6650     if (RedCost.isValid() &&
6651         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6652       return I == RetI ? RedCost : 0;
6653   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6654              !TheLoop->isLoopInvariant(RedOp)) {
6655     // Matched reduce(ext(A))
6656     bool IsUnsigned = isa<ZExtInst>(RedOp);
6657     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6658     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6659         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6660         CostKind);
6661 
6662     InstructionCost ExtCost =
6663         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6664                              TTI::CastContextHint::None, CostKind, RedOp);
6665     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6666       return I == RetI ? RedCost : 0;
6667   } else if (RedOp &&
6668              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6669     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6670         Op0->getOpcode() == Op1->getOpcode() &&
6671         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6672       bool IsUnsigned = isa<ZExtInst>(Op0);
6673       Type *Op0Ty = Op0->getOperand(0)->getType();
6674       Type *Op1Ty = Op1->getOperand(0)->getType();
6675       Type *LargestOpTy =
6676           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6677                                                                     : Op0Ty;
6678       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6679 
      // Matched reduce(mul(ext(A), ext(B))), where the two exts may be of
      // different sizes. We take the largest type as the ext to reduce, and
      // add the remaining cost as, for example,
      // reduce(mul(ext(ext(A)), ext(B))).
6683       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6684           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6685           TTI::CastContextHint::None, CostKind, Op0);
6686       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6687           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6688           TTI::CastContextHint::None, CostKind, Op1);
6689       InstructionCost MulCost =
6690           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6691 
6692       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6693           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6694           CostKind);
6695       InstructionCost ExtraExtCost = 0;
6696       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6697         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6698         ExtraExtCost = TTI.getCastInstrCost(
6699             ExtraExtOp->getOpcode(), ExtType,
6700             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6701             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6702       }
6703 
6704       if (RedCost.isValid() &&
6705           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6706         return I == RetI ? RedCost : 0;
6707     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6708       // Matched reduce(mul())
6709       InstructionCost MulCost =
6710           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6711 
6712       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6713           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
6714           CostKind);
6715 
6716       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6717         return I == RetI ? RedCost : 0;
6718     }
6719   }
6720 
6721   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
6722 }
6723 
6724 InstructionCost
6725 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6726                                                      ElementCount VF) {
6727   // Calculate scalar cost only. Vectorization cost should be ready at this
6728   // moment.
6729   if (VF.isScalar()) {
6730     Type *ValTy = getLoadStoreType(I);
6731     const Align Alignment = getLoadStoreAlignment(I);
6732     unsigned AS = getLoadStoreAddressSpace(I);
6733 
6734     return TTI.getAddressComputationCost(ValTy) +
6735            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6736                                TTI::TCK_RecipThroughput, I);
6737   }
6738   return getWideningCost(I, VF);
6739 }
6740 
6741 LoopVectorizationCostModel::VectorizationCostTy
6742 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6743                                                ElementCount VF) {
6744   // If we know that this instruction will remain uniform, check the cost of
6745   // the scalar version.
6746   if (isUniformAfterVectorization(I, VF))
6747     VF = ElementCount::getFixed(1);
6748 
6749   if (VF.isVector() && isProfitableToScalarize(I, VF))
6750     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6751 
6752   // Forced scalars do not have any scalarization overhead.
6753   auto ForcedScalar = ForcedScalars.find(VF);
6754   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6755     auto InstSet = ForcedScalar->second;
6756     if (InstSet.count(I))
6757       return VectorizationCostTy(
6758           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6759            VF.getKnownMinValue()),
6760           false);
6761   }
6762 
6763   Type *VectorTy;
6764   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6765 
6766   bool TypeNotScalarized = false;
6767   if (VF.isVector() && VectorTy->isVectorTy()) {
6768     unsigned NumParts = TTI.getNumberOfParts(VectorTy);
6769     if (NumParts)
6770       TypeNotScalarized = NumParts < VF.getKnownMinValue();
6771     else
6772       C = InstructionCost::getInvalid();
6773   }
6774   return VectorizationCostTy(C, TypeNotScalarized);
6775 }
6776 
6777 InstructionCost
6778 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6779                                                      ElementCount VF) const {
6780 
6781   // There is no mechanism yet to create a scalable scalarization loop,
6782   // so this is currently Invalid.
6783   if (VF.isScalable())
6784     return InstructionCost::getInvalid();
6785 
6786   if (VF.isScalar())
6787     return 0;
6788 
6789   InstructionCost Cost = 0;
6790   Type *RetTy = ToVectorTy(I->getType(), VF);
6791   if (!RetTy->isVoidTy() &&
6792       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6793     Cost += TTI.getScalarizationOverhead(
6794         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
6795         false);
6796 
6797   // Some targets keep addresses scalar.
6798   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6799     return Cost;
6800 
6801   // Some targets support efficient element stores.
6802   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6803     return Cost;
6804 
6805   // Collect operands to consider.
6806   CallInst *CI = dyn_cast<CallInst>(I);
6807   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6808 
6809   // Skip operands that do not require extraction/scalarization and do not incur
6810   // any overhead.
6811   SmallVector<Type *> Tys;
6812   for (auto *V : filterExtractingOperands(Ops, VF))
6813     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6814   return Cost + TTI.getOperandsScalarizationOverhead(
6815                     filterExtractingOperands(Ops, VF), Tys);
6816 }
6817 
6818 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6819   if (VF.isScalar())
6820     return;
6821   NumPredStores = 0;
6822   for (BasicBlock *BB : TheLoop->blocks()) {
6823     // For each instruction in the old loop.
6824     for (Instruction &I : *BB) {
6825       Value *Ptr =  getLoadStorePointerOperand(&I);
6826       if (!Ptr)
6827         continue;
6828 
6829       // TODO: We should generate better code and update the cost model for
6830       // predicated uniform stores. Today they are treated as any other
6831       // predicated store (see added test cases in
6832       // invariant-store-vectorization.ll).
6833       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6834         NumPredStores++;
6835 
6836       if (Legal->isUniformMemOp(I)) {
6837         // TODO: Avoid replicating loads and stores instead of
6838         // relying on instcombine to remove them.
6839         // Load: Scalar load + broadcast
6840         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6841         InstructionCost Cost;
6842         if (isa<StoreInst>(&I) && VF.isScalable() &&
6843             isLegalGatherOrScatter(&I, VF)) {
6844           Cost = getGatherScatterCost(&I, VF);
6845           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
6846         } else {
6847           assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
6848                  "Cannot yet scalarize uniform stores");
6849           Cost = getUniformMemOpCost(&I, VF);
6850           setWideningDecision(&I, VF, CM_Scalarize, Cost);
6851         }
6852         continue;
6853       }
6854 
6855       // We assume that widening is the best solution when possible.
6856       if (memoryInstructionCanBeWidened(&I, VF)) {
6857         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6858         int ConsecutiveStride = Legal->isConsecutivePtr(
6859             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6860         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6861                "Expected consecutive stride.");
6862         InstWidening Decision =
6863             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6864         setWideningDecision(&I, VF, Decision, Cost);
6865         continue;
6866       }
6867 
6868       // Choose between Interleaving, Gather/Scatter or Scalarization.
6869       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6870       unsigned NumAccesses = 1;
6871       if (isAccessInterleaved(&I)) {
6872         auto Group = getInterleavedAccessGroup(&I);
6873         assert(Group && "Fail to get an interleaved access group.");
6874 
6875         // Make one decision for the whole group.
6876         if (getWideningDecision(&I, VF) != CM_Unknown)
6877           continue;
6878 
6879         NumAccesses = Group->getNumMembers();
6880         if (interleavedAccessCanBeWidened(&I, VF))
6881           InterleaveCost = getInterleaveGroupCost(&I, VF);
6882       }
6883 
6884       InstructionCost GatherScatterCost =
6885           isLegalGatherOrScatter(&I, VF)
6886               ? getGatherScatterCost(&I, VF) * NumAccesses
6887               : InstructionCost::getInvalid();
6888 
6889       InstructionCost ScalarizationCost =
6890           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6891 
6892       // Choose better solution for the current VF,
6893       // write down this decision and use it during vectorization.
6894       InstructionCost Cost;
6895       InstWidening Decision;
6896       if (InterleaveCost <= GatherScatterCost &&
6897           InterleaveCost < ScalarizationCost) {
6898         Decision = CM_Interleave;
6899         Cost = InterleaveCost;
6900       } else if (GatherScatterCost < ScalarizationCost) {
6901         Decision = CM_GatherScatter;
6902         Cost = GatherScatterCost;
6903       } else {
6904         Decision = CM_Scalarize;
6905         Cost = ScalarizationCost;
6906       }
      // If the instruction belongs to an interleave group, the whole group
6908       // receives the same decision. The whole group receives the cost, but
6909       // the cost will actually be assigned to one instruction.
6910       if (auto Group = getInterleavedAccessGroup(&I))
6911         setWideningDecision(Group, VF, Decision, Cost);
6912       else
6913         setWideningDecision(&I, VF, Decision, Cost);
6914     }
6915   }
6916 
6917   // Make sure that any load of address and any other address computation
6918   // remains scalar unless there is gather/scatter support. This avoids
6919   // inevitable extracts into address registers, and also has the benefit of
6920   // activating LSR more, since that pass can't optimize vectorized
6921   // addresses.
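  //
  // For example (illustrative only), a loaded pointer that only feeds the
  // address operand of another memory access is kept scalar here; widening
  // it would force an extractelement per lane just to form each scalar
  // address.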
6922   if (TTI.prefersVectorizedAddressing())
6923     return;
6924 
6925   // Start with all scalar pointer uses.
6926   SmallPtrSet<Instruction *, 8> AddrDefs;
6927   for (BasicBlock *BB : TheLoop->blocks())
6928     for (Instruction &I : *BB) {
6929       Instruction *PtrDef =
6930         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6931       if (PtrDef && TheLoop->contains(PtrDef) &&
6932           getWideningDecision(&I, VF) != CM_GatherScatter)
6933         AddrDefs.insert(PtrDef);
6934     }
6935 
6936   // Add all instructions used to generate the addresses.
6937   SmallVector<Instruction *, 4> Worklist;
6938   append_range(Worklist, AddrDefs);
6939   while (!Worklist.empty()) {
6940     Instruction *I = Worklist.pop_back_val();
6941     for (auto &Op : I->operands())
6942       if (auto *InstOp = dyn_cast<Instruction>(Op))
6943         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6944             AddrDefs.insert(InstOp).second)
6945           Worklist.push_back(InstOp);
6946   }
6947 
6948   for (auto *I : AddrDefs) {
6949     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves finding out whether the
      // loaded value is involved in an address computation, it is instead
      // changed here when we know this is the case.
6954       InstWidening Decision = getWideningDecision(I, VF);
6955       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6956         // Scalarize a widened load of address.
6957         setWideningDecision(
6958             I, VF, CM_Scalarize,
6959             (VF.getKnownMinValue() *
6960              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6961       else if (auto Group = getInterleavedAccessGroup(I)) {
6962         // Scalarize an interleave group of address loads.
6963         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6964           if (Instruction *Member = Group->getMember(I))
6965             setWideningDecision(
6966                 Member, VF, CM_Scalarize,
6967                 (VF.getKnownMinValue() *
6968                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6969         }
6970       }
6971     } else
      // Make sure I gets scalarized and gets a cost estimate without
      // scalarization overhead.
6974       ForcedScalars[VF].insert(I);
6975   }
6976 }
6977 
6978 InstructionCost
6979 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6980                                                Type *&VectorTy) {
6981   Type *RetTy = I->getType();
6982   if (canTruncateToMinimalBitwidth(I, VF))
6983     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6984   auto SE = PSE.getSE();
6985   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6986 
6987   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6988                                                 ElementCount VF) -> bool {
6989     if (VF.isScalar())
6990       return true;
6991 
6992     auto Scalarized = InstsToScalarize.find(VF);
6993     assert(Scalarized != InstsToScalarize.end() &&
6994            "VF not yet analyzed for scalarization profitability");
6995     return !Scalarized->second.count(I) &&
6996            llvm::all_of(I->users(), [&](User *U) {
6997              auto *UI = cast<Instruction>(U);
6998              return !Scalarized->second.count(UI);
6999            });
7000   };
7001   (void) hasSingleCopyAfterVectorization;
7002 
7003   if (isScalarAfterVectorization(I, VF)) {
7004     // With the exception of GEPs and PHIs, after scalarization there should
7005     // only be one copy of the instruction generated in the loop. This is
7006     // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
7008     // it means we don't have to multiply the instruction cost by VF.
7009     assert(I->getOpcode() == Instruction::GetElementPtr ||
7010            I->getOpcode() == Instruction::PHI ||
7011            (I->getOpcode() == Instruction::BitCast &&
7012             I->getType()->isPointerTy()) ||
7013            hasSingleCopyAfterVectorization(I, VF));
7014     VectorTy = RetTy;
7015   } else
7016     VectorTy = ToVectorTy(RetTy, VF);
7017 
7018   // TODO: We need to estimate the cost of intrinsic calls.
7019   switch (I->getOpcode()) {
7020   case Instruction::GetElementPtr:
7021     // We mark this instruction as zero-cost because the cost of GEPs in
7022     // vectorized code depends on whether the corresponding memory instruction
7023     // is scalarized or not. Therefore, we handle GEPs with the memory
7024     // instruction cost.
7025     return 0;
7026   case Instruction::Br: {
7027     // In cases of scalarized and predicated instructions, there will be VF
7028     // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
7030     bool ScalarPredicatedBB = false;
7031     BranchInst *BI = cast<BranchInst>(I);
7032     if (VF.isVector() && BI->isConditional() &&
7033         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7034          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7035       ScalarPredicatedBB = true;
7036 
7037     if (ScalarPredicatedBB) {
7038       // Not possible to scalarize scalable vector with predicated instructions.
7039       if (VF.isScalable())
7040         return InstructionCost::getInvalid();
7041       // Return cost for branches around scalarized and predicated blocks.
7042       auto *Vec_i1Ty =
7043           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7044       return (
7045           TTI.getScalarizationOverhead(
7046               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7047           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7048     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7049       // The back-edge branch will remain, as will all scalar branches.
7050       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7051     else
7052       // This branch will be eliminated by if-conversion.
7053       return 0;
7054     // Note: We currently assume zero cost for an unconditional branch inside
7055     // a predicated block since it will become a fall-through, although we
7056     // may decide in the future to call TTI for all branches.
7057   }
7058   case Instruction::PHI: {
7059     auto *Phi = cast<PHINode>(I);
7060 
7061     // First-order recurrences are replaced by vector shuffles inside the loop.
7062     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7063     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7064       return TTI.getShuffleCost(
7065           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7066           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7067 
7068     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7069     // converted into select instructions. We require N - 1 selects per phi
7070     // node, where N is the number of incoming values.
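    // For example (illustrative only), a phi merging three incoming values
    // becomes two chained selects after if-conversion, so it is modeled as
    // 2 * (cost of one vector select).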
7071     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7072       return (Phi->getNumIncomingValues() - 1) *
7073              TTI.getCmpSelInstrCost(
7074                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7075                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7076                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7077 
7078     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7079   }
7080   case Instruction::UDiv:
7081   case Instruction::SDiv:
7082   case Instruction::URem:
7083   case Instruction::SRem:
7084     // If we have a predicated instruction, it may not be executed for each
7085     // vector lane. Get the scalarization cost and scale this amount by the
7086     // probability of executing the predicated block. If the instruction is not
7087     // predicated, we fall through to the next case.
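    // For example (illustrative, assuming VF = 4 and the default reciprocal
    // block probability of 2): the cost of four scalar copies of the divide,
    // their phis, and the insert/extract overhead is summed and then halved
    // to model conditional execution.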
7088     if (VF.isVector() && isScalarWithPredication(I, VF)) {
7089       InstructionCost Cost = 0;
7090 
7091       // These instructions have a non-void type, so account for the phi nodes
7092       // that we will create. This cost is likely to be zero. The phi node
7093       // cost, if any, should be scaled by the block probability because it
7094       // models a copy at the end of each predicated block.
7095       Cost += VF.getKnownMinValue() *
7096               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7097 
7098       // The cost of the non-predicated instruction.
7099       Cost += VF.getKnownMinValue() *
7100               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7101 
7102       // The cost of insertelement and extractelement instructions needed for
7103       // scalarization.
7104       Cost += getScalarizationOverhead(I, VF);
7105 
7106       // Scale the cost by the probability of executing the predicated blocks.
7107       // This assumes the predicated block for each vector lane is equally
7108       // likely.
7109       return Cost / getReciprocalPredBlockProb();
7110     }
7111     LLVM_FALLTHROUGH;
7112   case Instruction::Add:
7113   case Instruction::FAdd:
7114   case Instruction::Sub:
7115   case Instruction::FSub:
7116   case Instruction::Mul:
7117   case Instruction::FMul:
7118   case Instruction::FDiv:
7119   case Instruction::FRem:
7120   case Instruction::Shl:
7121   case Instruction::LShr:
7122   case Instruction::AShr:
7123   case Instruction::And:
7124   case Instruction::Or:
7125   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
7127     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7128       return 0;
7129 
7130     // Detect reduction patterns
7131     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7132       return *RedCost;
7133 
7134     // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7136     Value *Op2 = I->getOperand(1);
7137     TargetTransformInfo::OperandValueProperties Op2VP;
7138     TargetTransformInfo::OperandValueKind Op2VK =
7139         TTI.getOperandInfo(Op2, Op2VP);
7140     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7141       Op2VK = TargetTransformInfo::OK_UniformValue;
7142 
7143     SmallVector<const Value *, 4> Operands(I->operand_values());
7144     return TTI.getArithmeticInstrCost(
7145         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7146         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7147   }
7148   case Instruction::FNeg: {
7149     return TTI.getArithmeticInstrCost(
7150         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7151         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7152         TargetTransformInfo::OP_None, I->getOperand(0), I);
7153   }
7154   case Instruction::Select: {
7155     SelectInst *SI = cast<SelectInst>(I);
7156     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7157     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7158 
7159     const Value *Op0, *Op1;
7160     using namespace llvm::PatternMatch;
7161     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7162                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7163       // select x, y, false --> x & y
7164       // select x, true, y --> x | y
7165       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7166       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7167       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7168       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7169       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7170               Op1->getType()->getScalarSizeInBits() == 1);
7171 
7172       SmallVector<const Value *, 2> Operands{Op0, Op1};
7173       return TTI.getArithmeticInstrCost(
7174           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7175           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7176     }
7177 
7178     Type *CondTy = SI->getCondition()->getType();
7179     if (!ScalarCond)
7180       CondTy = VectorType::get(CondTy, VF);
7181 
7182     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7183     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7184       Pred = Cmp->getPredicate();
7185     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7186                                   CostKind, I);
7187   }
7188   case Instruction::ICmp:
7189   case Instruction::FCmp: {
7190     Type *ValTy = I->getOperand(0)->getType();
7191     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7192     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7193       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7194     VectorTy = ToVectorTy(ValTy, VF);
7195     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7196                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7197                                   I);
7198   }
7199   case Instruction::Store:
7200   case Instruction::Load: {
7201     ElementCount Width = VF;
7202     if (Width.isVector()) {
7203       InstWidening Decision = getWideningDecision(I, Width);
7204       assert(Decision != CM_Unknown &&
7205              "CM decision should be taken at this point");
7206       if (Decision == CM_Scalarize)
7207         Width = ElementCount::getFixed(1);
7208     }
7209     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7210     return getMemoryInstructionCost(I, VF);
7211   }
7212   case Instruction::BitCast:
7213     if (I->getType()->isPointerTy())
7214       return 0;
7215     LLVM_FALLTHROUGH;
7216   case Instruction::ZExt:
7217   case Instruction::SExt:
7218   case Instruction::FPToUI:
7219   case Instruction::FPToSI:
7220   case Instruction::FPExt:
7221   case Instruction::PtrToInt:
7222   case Instruction::IntToPtr:
7223   case Instruction::SIToFP:
7224   case Instruction::UIToFP:
7225   case Instruction::Trunc:
7226   case Instruction::FPTrunc: {
7227     // Computes the CastContextHint from a Load/Store instruction.
7228     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7229       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7230              "Expected a load or a store!");
7231 
7232       if (VF.isScalar() || !TheLoop->contains(I))
7233         return TTI::CastContextHint::Normal;
7234 
7235       switch (getWideningDecision(I, VF)) {
7236       case LoopVectorizationCostModel::CM_GatherScatter:
7237         return TTI::CastContextHint::GatherScatter;
7238       case LoopVectorizationCostModel::CM_Interleave:
7239         return TTI::CastContextHint::Interleave;
7240       case LoopVectorizationCostModel::CM_Scalarize:
7241       case LoopVectorizationCostModel::CM_Widen:
7242         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7243                                         : TTI::CastContextHint::Normal;
7244       case LoopVectorizationCostModel::CM_Widen_Reverse:
7245         return TTI::CastContextHint::Reversed;
7246       case LoopVectorizationCostModel::CM_Unknown:
7247         llvm_unreachable("Instr did not go through cost modelling?");
7248       }
7249 
7250       llvm_unreachable("Unhandled case!");
7251     };
7252 
7253     unsigned Opcode = I->getOpcode();
7254     TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc/FPTrunc, the context is the only user, which must be a
    // StoreInst.
7256     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7257       if (I->hasOneUse())
7258         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7259           CCH = ComputeCCH(Store);
7260     }
    // For ZExt/SExt/FPExt, the context is the operand, which must be a
    // LoadInst.
7262     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7263              Opcode == Instruction::FPExt) {
7264       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7265         CCH = ComputeCCH(Load);
7266     }
7267 
7268     // We optimize the truncation of induction variables having constant
7269     // integer steps. The cost of these truncations is the same as the scalar
7270     // operation.
7271     if (isOptimizableIVTruncate(I, VF)) {
7272       auto *Trunc = cast<TruncInst>(I);
7273       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7274                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7275     }
7276 
7277     // Detect reduction patterns
7278     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7279       return *RedCost;
7280 
7281     Type *SrcScalarTy = I->getOperand(0)->getType();
7282     Type *SrcVecTy =
7283         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7284     if (canTruncateToMinimalBitwidth(I, VF)) {
7285       // This cast is going to be shrunk. This may remove the cast or it might
    // turn it into a slightly different cast. For example, if MinBW == 16,
7287       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7288       //
7289       // Calculate the modified src and dest types.
7290       Type *MinVecTy = VectorTy;
7291       if (Opcode == Instruction::Trunc) {
7292         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7293         VectorTy =
7294             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7295       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7296         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7297         VectorTy =
7298             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7299       }
7300     }
7301 
7302     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7303   }
7304   case Instruction::Call: {
7305     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7306       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7307         return *RedCost;
7308     bool NeedToScalarize;
7309     CallInst *CI = cast<CallInst>(I);
7310     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7311     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7312       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7313       return std::min(CallCost, IntrinsicCost);
7314     }
7315     return CallCost;
7316   }
7317   case Instruction::ExtractValue:
7318     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7319   case Instruction::Alloca:
7320     // We cannot easily widen alloca to a scalable alloca, as
7321     // the result would need to be a vector of pointers.
7322     if (VF.isScalable())
7323       return InstructionCost::getInvalid();
7324     LLVM_FALLTHROUGH;
7325   default:
7326     // This opcode is unknown. Assume that it is the same as 'mul'.
7327     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7328   } // end of switch.
7329 }
7330 
7331 char LoopVectorize::ID = 0;
7332 
7333 static const char lv_name[] = "Loop Vectorization";
7334 
7335 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7336 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7337 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7338 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7339 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7340 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7341 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7342 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7343 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7344 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7345 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7346 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7347 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7348 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7349 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7350 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7351 
7352 namespace llvm {
7353 
7354 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7355 
7356 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7357                               bool VectorizeOnlyWhenForced) {
7358   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7359 }
7360 
7361 } // end namespace llvm
7362 
7363 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7364   // Check if the pointer operand of a load or store instruction is
7365   // consecutive.
7366   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7367     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7368   return false;
7369 }
7370 
7371 void LoopVectorizationCostModel::collectValuesToIgnore() {
7372   // Ignore ephemeral values.
7373   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7374 
7375   // Find all stores to invariant variables. Since they are going to sink
  // outside the loop, we do not need to calculate the cost for them.
7377   for (BasicBlock *BB : TheLoop->blocks())
7378     for (Instruction &I : *BB) {
7379       StoreInst *SI;
7380       if ((SI = dyn_cast<StoreInst>(&I)) &&
7381           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7382         ValuesToIgnore.insert(&I);
7383     }
7384 
7385   // Ignore type-promoting instructions we identified during reduction
7386   // detection.
7387   for (auto &Reduction : Legal->getReductionVars()) {
7388     const RecurrenceDescriptor &RedDes = Reduction.second;
7389     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7390     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7391   }
7392   // Ignore type-casting instructions we identified during induction
7393   // detection.
7394   for (auto &Induction : Legal->getInductionVars()) {
7395     const InductionDescriptor &IndDes = Induction.second;
7396     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7397     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7398   }
7399 }
7400 
7401 void LoopVectorizationCostModel::collectInLoopReductions() {
7402   for (auto &Reduction : Legal->getReductionVars()) {
7403     PHINode *Phi = Reduction.first;
7404     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7405 
7406     // We don't collect reductions that are type promoted (yet).
7407     if (RdxDesc.getRecurrenceType() != Phi->getType())
7408       continue;
7409 
7410     // If the target would prefer this reduction to happen "in-loop", then we
7411     // want to record it as such.
7412     unsigned Opcode = RdxDesc.getOpcode();
7413     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7414         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7415                                    TargetTransformInfo::ReductionFlags()))
7416       continue;
7417 
7418     // Check that we can correctly put the reductions into the loop, by
7419     // finding the chain of operations that leads from the phi to the loop
7420     // exit value.
7421     SmallVector<Instruction *, 4> ReductionOperations =
7422         RdxDesc.getReductionOpChain(Phi, TheLoop);
7423     bool InLoop = !ReductionOperations.empty();
7424     if (InLoop) {
7425       InLoopReductionChains[Phi] = ReductionOperations;
7426       // Add the elements to InLoopReductionImmediateChains for cost modelling.
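      // E.g. for a chain phi -> %add1 -> %add2, we map %add1 to the phi and
      // %add2 to %add1.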
7427       Instruction *LastChain = Phi;
7428       for (auto *I : ReductionOperations) {
7429         InLoopReductionImmediateChains[I] = LastChain;
7430         LastChain = I;
7431       }
7432     }
7433     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7434                       << " reduction for phi: " << *Phi << "\n");
7435   }
7436 }
7437 
7438 // TODO: we could return a pair of values that specify the max VF and
7439 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7441 // doesn't have a cost model that can choose which plan to execute if
7442 // more than one is generated.
7443 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7444                                  LoopVectorizationCostModel &CM) {
7445   unsigned WidestType;
7446   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
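  // E.g. 256-bit wide vector registers and a widest scalar type of 32 bits
  // give a VF of 8.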
7447   return WidestVectorRegBits / WidestType;
7448 }
7449 
7450 VectorizationFactor
7451 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7452   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7453   ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
7455   // transformations before even evaluating whether vectorization is profitable.
7456   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7457   // the vectorization pipeline.
7458   if (!OrigLoop->isInnermost()) {
7459     // If the user doesn't provide a vectorization factor, determine a
7460     // reasonable one.
7461     if (UserVF.isZero()) {
7462       VF = ElementCount::getFixed(determineVPlanVF(
7463           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7464               .getFixedSize(),
7465           CM));
7466       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7467 
7468       // Make sure we have a VF > 1 for stress testing.
7469       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7470         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7471                           << "overriding computed VF.\n");
7472         VF = ElementCount::getFixed(4);
7473       }
7474     }
7475     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7476     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7477            "VF needs to be a power of two");
7478     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7479                       << "VF " << VF << " to build VPlans.\n");
7480     buildVPlans(VF, VF);
7481 
7482     // For VPlan build stress testing, we bail out after VPlan construction.
7483     if (VPlanBuildStressTest)
7484       return VectorizationFactor::Disabled();
7485 
7486     return {VF, 0 /*Cost*/};
7487   }
7488 
7489   LLVM_DEBUG(
7490       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7491                 "VPlan-native path.\n");
7492   return VectorizationFactor::Disabled();
7493 }
7494 
7495 Optional<VectorizationFactor>
7496 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7497   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7498   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7500     return None;
7501 
  // Invalidate interleave groups if all blocks of the loop will be predicated.
7503   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7504       !useMaskedInterleavedAccesses(*TTI)) {
7505     LLVM_DEBUG(
7506         dbgs()
7507         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7508            "which requires masked-interleaved support.\n");
7509     if (CM.InterleaveInfo.invalidateGroups())
7510       // Invalidating interleave groups also requires invalidating all decisions
7511       // based on them, which includes widening decisions and uniform and scalar
7512       // values.
7513       CM.invalidateCostModelingDecisions();
7514   }
7515 
7516   ElementCount MaxUserVF =
7517       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7518   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7519   if (!UserVF.isZero() && UserVFIsLegal) {
7520     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7521            "VF needs to be a power of two");
7522     // Collect the instructions (and their associated costs) that will be more
7523     // profitable to scalarize.
7524     if (CM.selectUserVectorizationFactor(UserVF)) {
7525       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7526       CM.collectInLoopReductions();
7527       buildVPlansWithVPRecipes(UserVF, UserVF);
7528       LLVM_DEBUG(printPlans(dbgs()));
7529       return {{UserVF, 0}};
7530     } else
7531       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7532                               "InvalidCost", ORE, OrigLoop);
7533   }
7534 
7535   // Populate the set of Vectorization Factor Candidates.
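  // E.g. with a maximum fixed VF of 8 and a maximum scalable VF of 4, the
  // candidates are {1, 2, 4, 8, vscale x 1, vscale x 2, vscale x 4}.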
7536   ElementCountSet VFCandidates;
7537   for (auto VF = ElementCount::getFixed(1);
7538        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7539     VFCandidates.insert(VF);
7540   for (auto VF = ElementCount::getScalable(1);
7541        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7542     VFCandidates.insert(VF);
7543 
7544   for (const auto &VF : VFCandidates) {
7545     // Collect Uniform and Scalar instructions after vectorization with VF.
7546     CM.collectUniformsAndScalars(VF);
7547 
7548     // Collect the instructions (and their associated costs) that will be more
7549     // profitable to scalarize.
7550     if (VF.isVector())
7551       CM.collectInstsToScalarize(VF);
7552   }
7553 
7554   CM.collectInLoopReductions();
7555   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7556   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7557 
7558   LLVM_DEBUG(printPlans(dbgs()));
7559   if (!MaxFactors.hasVector())
7560     return VectorizationFactor::Disabled();
7561 
7562   // Select the optimal vectorization factor.
7563   auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
7564 
7565   // Check if it is profitable to vectorize with runtime checks.
7566   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
7567   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
7568     bool PragmaThresholdReached =
7569         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
7570     bool ThresholdReached =
7571         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
7572     if ((ThresholdReached && !Hints.allowReordering()) ||
7573         PragmaThresholdReached) {
7574       ORE->emit([&]() {
7575         return OptimizationRemarkAnalysisAliasing(
7576                    DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
7577                    OrigLoop->getHeader())
7578                << "loop not vectorized: cannot prove it is safe to reorder "
7579                   "memory operations";
7580       });
7581       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
7582       Hints.emitRemarkWithHints();
7583       return VectorizationFactor::Disabled();
7584     }
7585   }
7586   return SelectedVF;
7587 }
7588 
7589 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7590   assert(count_if(VPlans,
7591                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7592              1 &&
         "Best VF does not have a single VPlan.");
7594 
7595   for (const VPlanPtr &Plan : VPlans) {
7596     if (Plan->hasVF(VF))
7597       return *Plan.get();
7598   }
7599   llvm_unreachable("No plan found!");
7600 }
7601 
7602 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7603   SmallVector<Metadata *, 4> MDs;
7604   // Reserve first location for self reference to the LoopID metadata node.
7605   MDs.push_back(nullptr);
7606   bool IsUnrollMetadata = false;
7607   MDNode *LoopID = L->getLoopID();
7608   if (LoopID) {
7609     // First find existing loop unrolling disable metadata.
7610     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7611       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7612       if (MD) {
7613         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7614         IsUnrollMetadata =
7615             S && S->getString().startswith("llvm.loop.unroll.disable");
7616       }
7617       MDs.push_back(LoopID->getOperand(i));
7618     }
7619   }
7620 
7621   if (!IsUnrollMetadata) {
7622     // Add runtime unroll disable metadata.
7623     LLVMContext &Context = L->getHeader()->getContext();
7624     SmallVector<Metadata *, 1> DisableOperands;
7625     DisableOperands.push_back(
7626         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7627     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7628     MDs.push_back(DisableNode);
7629     MDNode *NewLoopID = MDNode::get(Context, MDs);
7630     // Set operand 0 to refer to the loop id itself.
7631     NewLoopID->replaceOperandWith(0, NewLoopID);
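    // The resulting loop ID is roughly:
    //   !N = distinct !{!N, <existing operands>,
    //                   !{!"llvm.loop.unroll.runtime.disable"}}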
7632     L->setLoopID(NewLoopID);
7633   }
7634 }
7635 
7636 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7637                                            VPlan &BestVPlan,
7638                                            InnerLoopVectorizer &ILV,
7639                                            DominatorTree *DT) {
7640   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7641                     << '\n');
7642 
7643   // Perform the actual loop transformation.
7644 
7645   // 1. Set up the skeleton for vectorization, including vector pre-header and
7646   // middle block. The vector loop is created during VPlan execution.
7647   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7648   Value *CanonicalIVStartValue;
7649   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7650       ILV.createVectorizedLoopSkeleton();
7651   ILV.collectPoisonGeneratingRecipes(State);
7652 
7653   ILV.printDebugTracesAtStart();
7654 
7655   //===------------------------------------------------===//
7656   //
  // Notice: any optimization or new instruction that goes
7658   // into the code below should also be implemented in
7659   // the cost-model.
7660   //
7661   //===------------------------------------------------===//
7662 
7663   // 2. Copy and widen instructions from the old loop into the new loop.
7664   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7665                              ILV.getOrCreateVectorTripCount(nullptr),
7666                              CanonicalIVStartValue, State);
7667   BestVPlan.execute(&State);
7668 
  // Derive the loop ID for the vector loop from the original loop's metadata,
  // taking any vectorizer follow-up hints into account.
7671   MDNode *OrigLoopID = OrigLoop->getLoopID();
7672 
7673   Optional<MDNode *> VectorizedLoopID =
7674       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7675                                       LLVMLoopVectorizeFollowupVectorized});
7676 
7677   VPBasicBlock *HeaderVPBB =
7678       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7679   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7680   if (VectorizedLoopID.hasValue())
7681     L->setLoopID(VectorizedLoopID.getValue());
7682   else {
7683     // Keep all loop hints from the original loop on the vector loop (we'll
7684     // replace the vectorizer-specific hints below).
7685     if (MDNode *LID = OrigLoop->getLoopID())
7686       L->setLoopID(LID);
7687 
7688     LoopVectorizeHints Hints(L, true, *ORE);
7689     Hints.setAlreadyVectorized();
7690   }
7691   // Disable runtime unrolling when vectorizing the epilogue loop.
7692   if (CanonicalIVStartValue)
7693     AddRuntimeUnrollDisableMetaData(L);
7694 
  // 3. Fix the vectorized code: take care of header phis, live-outs,
7696   //    predication, updating analyses.
7697   ILV.fixVectorizedLoop(State, BestVPlan);
7698 
7699   ILV.printDebugTracesAtEnd();
7700 }
7701 
7702 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7703 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7704   for (const auto &Plan : VPlans)
7705     if (PrintVPlansInDotFormat)
7706       Plan->printDOT(O);
7707     else
7708       Plan->print(O);
7709 }
7710 #endif
7711 
7712 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7713     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7714 
  // We create new control flow for the vectorized loop, so an original exit
  // condition will be dead after vectorization if it is only used by its
  // block's terminator.
7718   SmallVector<BasicBlock*> ExitingBlocks;
7719   OrigLoop->getExitingBlocks(ExitingBlocks);
7720   for (auto *BB : ExitingBlocks) {
7721     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7722     if (!Cmp || !Cmp->hasOneUse())
7723       continue;
7724 
7725     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7726     if (!DeadInstructions.insert(Cmp).second)
7727       continue;
7728 
    // An operand of the icmp is often a dead trunc, used by IndUpdate.
7730     // TODO: can recurse through operands in general
7731     for (Value *Op : Cmp->operands()) {
7732       if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
7734     }
7735   }
7736 
7737   // We create new "steps" for induction variable updates to which the original
7738   // induction variables map. An original update instruction will be dead if
7739   // all its users except the induction variable are dead.
7740   auto *Latch = OrigLoop->getLoopLatch();
7741   for (auto &Induction : Legal->getInductionVars()) {
7742     PHINode *Ind = Induction.first;
7743     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7744 
7745     // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
7747     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7748       continue;
7749 
7750     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7751           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7752         }))
7753       DeadInstructions.insert(IndUpdate);
7754   }
7755 }
7756 
7757 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7758 
7759 //===--------------------------------------------------------------------===//
7760 // EpilogueVectorizerMainLoop
7761 //===--------------------------------------------------------------------===//
7762 
7763 /// This function is partially responsible for generating the control flow
7764 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7765 std::pair<BasicBlock *, Value *>
7766 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7767   MDNode *OrigLoopID = OrigLoop->getLoopID();
7768 
7769   // Workaround!  Compute the trip count of the original loop and cache it
7770   // before we start modifying the CFG.  This code has a systemic problem
7771   // wherein it tries to run analysis over partially constructed IR; this is
7772   // wrong, and not simply for SCEV.  The trip count of the original loop
7773   // simply happens to be prone to hitting this in practice.  In theory, we
7774   // can hit the same issue for any SCEV, or ValueTracking query done during
7775   // mutation.  See PR49900.
7776   getOrCreateTripCount(OrigLoop->getLoopPreheader());
7777   createVectorLoopSkeleton("");
7778 
7779   // Generate the code to check the minimum iteration count of the vector
7780   // epilogue (see below).
7781   EPI.EpilogueIterationCountCheck =
7782       emitIterationCountCheck(LoopScalarPreHeader, true);
7783   EPI.EpilogueIterationCountCheck->setName("iter.check");
7784 
7785   // Generate the code to check any assumptions that we've made for SCEV
7786   // expressions.
7787   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7788 
7789   // Generate the code that checks at runtime if arrays overlap. We put the
7790   // checks into a separate block to make the more common case of few elements
7791   // faster.
7792   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7793 
7794   // Generate the iteration count check for the main loop, *after* the check
7795   // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer path length for
  // the main loop is compensated for by the gain from vectorizing the larger
7798   // trip count. Note: the branch will get updated later on when we vectorize
7799   // the epilogue.
7800   EPI.MainLoopIterationCountCheck =
7801       emitIterationCountCheck(LoopScalarPreHeader, false);
7802 
7803   // Generate the induction variable.
7804   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7805 
  // Skip induction resume value creation here; the resume values will be
  // created in the second pass. If we created them here, they wouldn't be used
  // anyway, because the VPlan in the second pass still contains the inductions
  // from the original loop.
7810 
7811   return {completeLoopSkeleton(OrigLoopID), nullptr};
7812 }
7813 
7814 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7815   LLVM_DEBUG({
7816     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7817            << "Main Loop VF:" << EPI.MainLoopVF
7818            << ", Main Loop UF:" << EPI.MainLoopUF
7819            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7820            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7821   });
7822 }
7823 
7824 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7825   DEBUG_WITH_TYPE(VerboseDebug, {
7826     dbgs() << "intermediate fn:\n"
7827            << *OrigLoop->getHeader()->getParent() << "\n";
7828   });
7829 }
7830 
7831 BasicBlock *
7832 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7833                                                     bool ForEpilogue) {
7834   assert(Bypass && "Expected valid bypass basic block.");
7835   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7836   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7837   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
7838   // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
7840   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7841   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7842 
7843   // Generate code to check if the loop's trip count is less than VF * UF of the
7844   // main vector loop.
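  // E.g. with VF = 4 and UF = 2, the branch to the bypass block is taken when
  // the trip count is below 8 (or at most 8 when a scalar epilogue is
  // required).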
7845   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
7846       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7847 
7848   Value *CheckMinIters = Builder.CreateICmp(
7849       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7850       "min.iters.check");
7851 
7852   if (!ForEpilogue)
7853     TCCheckBlock->setName("vector.main.loop.iter.check");
7854 
7855   // Create new preheader for vector loop.
7856   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7857                                    DT, LI, nullptr, "vector.ph");
7858 
7859   if (ForEpilogue) {
7860     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7861                                  DT->getNode(Bypass)->getIDom()) &&
7862            "TC check is expected to dominate Bypass");
7863 
7864     // Update dominator for Bypass & LoopExit.
7865     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7866     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7867       // For loops with multiple exits, there's no edge from the middle block
7868       // to exit blocks (as the epilogue must run) and thus no need to update
7869       // the immediate dominator of the exit blocks.
7870       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7871 
7872     LoopBypassBlocks.push_back(TCCheckBlock);
7873 
7874     // Save the trip count so we don't have to regenerate it in the
7875     // vec.epilog.iter.check. This is safe to do because the trip count
7876     // generated here dominates the vector epilog iter check.
7877     EPI.TripCount = Count;
7878   }
7879 
7880   ReplaceInstWithInst(
7881       TCCheckBlock->getTerminator(),
7882       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7883 
7884   return TCCheckBlock;
7885 }
7886 
7887 //===--------------------------------------------------------------------===//
7888 // EpilogueVectorizerEpilogueLoop
7889 //===--------------------------------------------------------------------===//
7890 
7891 /// This function is partially responsible for generating the control flow
7892 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7893 std::pair<BasicBlock *, Value *>
7894 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7895   MDNode *OrigLoopID = OrigLoop->getLoopID();
7896   createVectorLoopSkeleton("vec.epilog.");
7897 
  // Now, compare the remaining count and, if there aren't enough iterations to
  // execute the vectorized epilogue, skip to the scalar part.
7900   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7901   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7902   LoopVectorPreHeader =
7903       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7904                  LI, nullptr, "vec.epilog.ph");
7905   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7906                                           VecEpilogueIterationCountCheck);
7907 
7908   // Adjust the control flow taking the state info from the main loop
7909   // vectorization into account.
7910   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7911          "expected this to be saved from the previous pass.");
7912   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7913       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7914 
7915   DT->changeImmediateDominator(LoopVectorPreHeader,
7916                                EPI.MainLoopIterationCountCheck);
7917 
7918   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7919       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7920 
7921   if (EPI.SCEVSafetyCheck)
7922     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7923         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7924   if (EPI.MemSafetyCheck)
7925     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7926         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7927 
7928   DT->changeImmediateDominator(
7929       VecEpilogueIterationCountCheck,
7930       VecEpilogueIterationCountCheck->getSinglePredecessor());
7931 
7932   DT->changeImmediateDominator(LoopScalarPreHeader,
7933                                EPI.EpilogueIterationCountCheck);
7934   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
    // If there is an epilogue which must run, there's no edge from the middle
    // block to the exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
7938     DT->changeImmediateDominator(LoopExitBlock,
7939                                  EPI.EpilogueIterationCountCheck);
7940 
7941   // Keep track of bypass blocks, as they feed start values to the induction
7942   // phis in the scalar loop preheader.
7943   if (EPI.SCEVSafetyCheck)
7944     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7945   if (EPI.MemSafetyCheck)
7946     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7947   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7948 
7949   // The vec.epilog.iter.check block may contain Phi nodes from reductions which
7950   // merge control-flow from the latch block and the middle block. Update the
7951   // incoming values here and move the Phi into the preheader.
7952   SmallVector<PHINode *, 4> PhisInBlock;
7953   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7954     PhisInBlock.push_back(&Phi);
7955 
7956   for (PHINode *Phi : PhisInBlock) {
7957     Phi->replaceIncomingBlockWith(
7958         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7959         VecEpilogueIterationCountCheck);
7960     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7961     if (EPI.SCEVSafetyCheck)
7962       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7963     if (EPI.MemSafetyCheck)
7964       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7965     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7966   }
7967 
7968   // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
7970   Type *IdxTy = Legal->getWidestInductionType();
7971   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7972                                          LoopVectorPreHeader->getFirstNonPHI());
7973   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7974   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7975                            EPI.MainLoopIterationCountCheck);
7976 
7977   // Generate induction resume values. These variables save the new starting
7978   // indexes for the scalar loop. They are used to test if there are any tail
7979   // iterations left once the vector loop has completed.
7980   // Note that when the vectorized epilogue is skipped due to iteration count
7981   // check, then the resume value for the induction variable comes from
7982   // the trip count of the main vector loop, hence passing the AdditionalBypass
7983   // argument.
7984   createInductionResumeValues({VecEpilogueIterationCountCheck,
7985                                EPI.VectorTripCount} /* AdditionalBypass */);
7986 
7987   return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
7988 }
7989 
7990 BasicBlock *
7991 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7992     BasicBlock *Bypass, BasicBlock *Insert) {
7993 
7994   assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
7996   assert(
7997       (!isa<Instruction>(EPI.TripCount) ||
7998        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7999       "saved trip count does not dominate insertion point.");
8000   Value *TC = EPI.TripCount;
8001   IRBuilder<> Builder(Insert->getTerminator());
8002   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8003 
8004   // Generate code to check if the loop's trip count is less than VF * UF of the
8005   // vector epilogue loop.
8006   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8007       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8008 
8009   Value *CheckMinIters =
8010       Builder.CreateICmp(P, Count,
8011                          createStepForVF(Builder, Count->getType(),
8012                                          EPI.EpilogueVF, EPI.EpilogueUF),
8013                          "min.epilog.iters.check");
8014 
8015   ReplaceInstWithInst(
8016       Insert->getTerminator(),
8017       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8018 
8019   LoopBypassBlocks.push_back(Insert);
8020   return Insert;
8021 }
8022 
8023 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8024   LLVM_DEBUG({
8025     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8026            << "Epilogue Loop VF:" << EPI.EpilogueVF
8027            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8028   });
8029 }
8030 
8031 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8032   DEBUG_WITH_TYPE(VerboseDebug, {
8033     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8034   });
8035 }
8036 
8037 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8038     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8039   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8040   bool PredicateAtRangeStart = Predicate(Range.Start);
8041 
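  // E.g. for Range = [4, 32) and a predicate whose value first changes at
  // VF = 16, clamp Range.End to 16 and return the predicate's value at VF = 4.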
8042   for (ElementCount TmpVF = Range.Start * 2;
8043        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8044     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8045       Range.End = TmpVF;
8046       break;
8047     }
8048 
8049   return PredicateAtRangeStart;
8050 }
8051 
8052 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8053 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8054 /// of VF's starting at a given VF and extending it as much as possible. Each
8055 /// vectorization decision can potentially shorten this sub-range during
8056 /// buildVPlan().
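/// E.g. for MinVF = 1 and MaxVF = 8, this may result in one plan covering
/// {1, 2} and another covering {4, 8} if some decision changes between VF = 2
/// and VF = 4.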
8057 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8058                                            ElementCount MaxVF) {
8059   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8060   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8061     VFRange SubRange = {VF, MaxVFPlusOne};
8062     VPlans.push_back(buildVPlan(SubRange));
8063     VF = SubRange.End;
8064   }
8065 }
8066 
8067 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8068                                          VPlanPtr &Plan) {
8069   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8070 
8071   // Look for cached value.
8072   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8073   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8074   if (ECEntryIt != EdgeMaskCache.end())
8075     return ECEntryIt->second;
8076 
8077   VPValue *SrcMask = createBlockInMask(Src, Plan);
8078 
8079   // The terminator has to be a branch inst!
8080   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8081   assert(BI && "Unexpected terminator found");
8082 
8083   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8084     return EdgeMaskCache[Edge] = SrcMask;
8085 
8086   // If source is an exiting block, we know the exit edge is dynamically dead
8087   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8088   // adding uses of an otherwise potentially dead instruction.
8089   if (OrigLoop->isLoopExiting(Src))
8090     return EdgeMaskCache[Edge] = SrcMask;
8091 
8092   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8093   assert(EdgeMask && "No Edge Mask found for condition");
8094 
8095   if (BI->getSuccessor(0) != Dst)
8096     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8097 
8098   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8099     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8100     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8101     // The select version does not introduce new UB if SrcMask is false and
8102     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8103     VPValue *False = Plan->getOrAddVPValue(
8104         ConstantInt::getFalse(BI->getCondition()->getType()));
8105     EdgeMask =
8106         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8107   }
8108 
8109   return EdgeMaskCache[Edge] = EdgeMask;
8110 }
8111 
8112 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8113   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8114 
8115   // Look for cached value.
8116   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8117   if (BCEntryIt != BlockMaskCache.end())
8118     return BCEntryIt->second;
8119 
8120   // All-one mask is modelled as no-mask following the convention for masked
8121   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8122   VPValue *BlockMask = nullptr;
8123 
8124   if (OrigLoop->getHeader() == BB) {
8125     if (!CM.blockNeedsPredicationForAnyReason(BB))
8126       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8127 
8128     // Introduce the early-exit compare IV <= BTC to form header block mask.
8129     // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8130     // constructing the desired canonical IV in the header block as its first
8131     // non-phi instructions.
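    // E.g. for an i8 IV with a trip count of 256, TC wraps to 0 while BTC is
    // 255, so 'IV <= BTC' remains a correct test for active lanes.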
8132     assert(CM.foldTailByMasking() && "must fold the tail");
8133     VPBasicBlock *HeaderVPBB =
8134         Plan->getVectorLoopRegion()->getEntryBasicBlock();
8135     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8136     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8137     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8138 
8139     VPBuilder::InsertPointGuard Guard(Builder);
8140     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8141     if (CM.TTI.emitGetActiveLaneMask()) {
8142       VPValue *TC = Plan->getOrCreateTripCount();
8143       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
8144     } else {
8145       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8146       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8147     }
8148     return BlockMaskCache[BB] = BlockMask;
8149   }
8150 
8151   // This is the block mask. We OR all incoming edges.
8152   for (auto *Predecessor : predecessors(BB)) {
8153     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8154     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8155       return BlockMaskCache[BB] = EdgeMask;
8156 
8157     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8158       BlockMask = EdgeMask;
8159       continue;
8160     }
8161 
8162     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8163   }
8164 
8165   return BlockMaskCache[BB] = BlockMask;
8166 }
8167 
8168 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8169                                                 ArrayRef<VPValue *> Operands,
8170                                                 VFRange &Range,
8171                                                 VPlanPtr &Plan) {
8172   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8173          "Must be called with either a load or store");
8174 
8175   auto willWiden = [&](ElementCount VF) -> bool {
8176     if (VF.isScalar())
8177       return false;
8178     LoopVectorizationCostModel::InstWidening Decision =
8179         CM.getWideningDecision(I, VF);
8180     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8181            "CM decision should be taken at this point.");
8182     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8183       return true;
8184     if (CM.isScalarAfterVectorization(I, VF) ||
8185         CM.isProfitableToScalarize(I, VF))
8186       return false;
8187     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8188   };
8189 
8190   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8191     return nullptr;
8192 
8193   VPValue *Mask = nullptr;
8194   if (Legal->isMaskRequired(I))
8195     Mask = createBlockInMask(I->getParent(), Plan);
8196 
8197   // Determine if the pointer operand of the access is either consecutive or
8198   // reverse consecutive.
8199   LoopVectorizationCostModel::InstWidening Decision =
8200       CM.getWideningDecision(I, Range.Start);
8201   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8202   bool Consecutive =
8203       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8204 
8205   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8206     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8207                                               Consecutive, Reverse);
8208 
8209   StoreInst *Store = cast<StoreInst>(I);
8210   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8211                                             Mask, Consecutive, Reverse);
8212 }
8213 
/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8215 /// insert a recipe to expand the step for the induction recipe.
8216 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
8217     PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
8218     const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
8219     VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
8220   // Returns true if an instruction \p I should be scalarized instead of
8221   // vectorized for the chosen vectorization factor.
8222   auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8223     return CM.isScalarAfterVectorization(I, VF) ||
8224            CM.isProfitableToScalarize(I, VF);
8225   };
8226 
8227   bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange(
8228       [&](ElementCount VF) {
        // Returns true if we should generate a scalar version of the IV.
8230         if (ShouldScalarizeInstruction(PhiOrTrunc, VF))
8231           return true;
8232         auto isScalarInst = [&](User *U) -> bool {
8233           auto *I = cast<Instruction>(U);
8234           return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF);
8235         };
8236         return any_of(PhiOrTrunc->users(), isScalarInst);
8237       },
8238       Range);
8239   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8240       [&](ElementCount VF) {
8241         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8242       },
8243       Range);
8244   assert(IndDesc.getStartValue() ==
8245          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8246   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8247          "step must be loop invariant");
8248 
8249   VPValue *Step =
8250       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8251   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8252     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
8253                                              NeedsScalarIV, !NeedsScalarIVOnly);
8254   }
8255   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8256   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
8257                                            NeedsScalarIV, !NeedsScalarIVOnly);
8258 }
8259 
8260 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8261     PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8262 
8263   // Check if this is an integer or fp induction. If so, build the recipe that
8264   // produces its scalar and vector values.
8265   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8266     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
8267                                        *PSE.getSE(), *OrigLoop, Range);
8268 
8269   // Check if this is pointer induction. If so, build the recipe for it.
8270   if (auto *II = Legal->getPointerInductionDescriptor(Phi))
8271     return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II,
8272                                              *PSE.getSE());
8273   return nullptr;
8274 }
8275 
8276 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8277     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8278   // Optimize the special case where the source is a constant integer
8279   // induction variable. Notice that we can only optimize the 'trunc' case
8280   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8281   // (c) other casts depend on pointer size.
8282 
8283   // Determine whether \p K is a truncation based on an induction variable that
8284   // can be optimized.
8285   auto isOptimizableIVTruncate =
8286       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8287     return [=](ElementCount VF) -> bool {
8288       return CM.isOptimizableIVTruncate(K, VF);
8289     };
8290   };
8291 
8292   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8293           isOptimizableIVTruncate(I), Range)) {
8294 
8295     auto *Phi = cast<PHINode>(I->getOperand(0));
8296     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8297     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8298     return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
8299                                        *PSE.getSE(), *OrigLoop, Range);
8300   }
8301   return nullptr;
8302 }
8303 
8304 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8305                                                 ArrayRef<VPValue *> Operands,
8306                                                 VPlanPtr &Plan) {
8307   // If all incoming values are equal, the incoming VPValue can be used directly
8308   // instead of creating a new VPBlendRecipe.
8309   VPValue *FirstIncoming = Operands[0];
8310   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8311         return FirstIncoming == Inc;
8312       })) {
8313     return Operands[0];
8314   }
8315 
8316   unsigned NumIncoming = Phi->getNumIncomingValues();
8317   // For in-loop reductions, we do not need to create an additional select.
8318   VPValue *InLoopVal = nullptr;
8319   for (unsigned In = 0; In < NumIncoming; In++) {
8320     PHINode *PhiOp =
8321         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8322     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8323       assert(!InLoopVal && "Found more than one in-loop reduction!");
8324       InLoopVal = Operands[In];
8325     }
8326   }
8327 
8328   assert((!InLoopVal || NumIncoming == 2) &&
8329          "Found an in-loop reduction for PHI with unexpected number of "
8330          "incoming values");
8331   if (InLoopVal)
8332     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8333 
8334   // We know that all PHIs in non-header blocks are converted into selects, so
8335   // we don't have to worry about the insertion order and we can just use the
8336   // builder. At this point we generate the predication tree. There may be
8337   // duplications since this is a simple recursive scan, but future
8338   // optimizations will clean it up.
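  // E.g. a phi with incoming values %a and %b becomes a VPBlendRecipe with
  // operands {%a, mask of %a's incoming edge, %b, mask of %b's incoming edge};
  // an all-one edge mask is omitted.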
8339   SmallVector<VPValue *, 2> OperandsWithMask;
8340 
8341   for (unsigned In = 0; In < NumIncoming; In++) {
8342     VPValue *EdgeMask =
8343       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8344     assert((EdgeMask || NumIncoming == 1) &&
8345            "Multiple predecessors with one having a full mask");
8346     OperandsWithMask.push_back(Operands[In]);
8347     if (EdgeMask)
8348       OperandsWithMask.push_back(EdgeMask);
8349   }
8350   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8351 }
8352 
8353 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8354                                                    ArrayRef<VPValue *> Operands,
8355                                                    VFRange &Range) const {
8356 
8357   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8358       [this, CI](ElementCount VF) {
8359         return CM.isScalarWithPredication(CI, VF);
8360       },
8361       Range);
8362 
8363   if (IsPredicated)
8364     return nullptr;
8365 
8366   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8367   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8368              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8369              ID == Intrinsic::pseudoprobe ||
8370              ID == Intrinsic::experimental_noalias_scope_decl))
8371     return nullptr;
8372 
8373   auto willWiden = [&](ElementCount VF) -> bool {
8374     if (VF.isScalar())
      return false;
8376     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8377     // The following case may be scalarized depending on the VF.
    // The flag shows whether we use an intrinsic or a plain call for the
    // vectorized version of the instruction.
    // Is it beneficial to perform the intrinsic call compared to the lib call?
8381     bool NeedToScalarize = false;
8382     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8383     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8384     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8385     return UseVectorIntrinsic || !NeedToScalarize;
8386   };
8387 
8388   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8389     return nullptr;
8390 
8391   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8392   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8393 }
8394 
8395 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8396   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8397          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // The instruction should be widened, unless it is scalar after
  // vectorization, scalarization is profitable, or it is predicated.
8400   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8401     return CM.isScalarAfterVectorization(I, VF) ||
8402            CM.isProfitableToScalarize(I, VF) ||
8403            CM.isScalarWithPredication(I, VF);
8404   };
8405   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8406                                                              Range);
8407 }
8408 
8409 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8410                                            ArrayRef<VPValue *> Operands) const {
8411   auto IsVectorizableOpcode = [](unsigned Opcode) {
8412     switch (Opcode) {
8413     case Instruction::Add:
8414     case Instruction::And:
8415     case Instruction::AShr:
8416     case Instruction::BitCast:
8417     case Instruction::FAdd:
8418     case Instruction::FCmp:
8419     case Instruction::FDiv:
8420     case Instruction::FMul:
8421     case Instruction::FNeg:
8422     case Instruction::FPExt:
8423     case Instruction::FPToSI:
8424     case Instruction::FPToUI:
8425     case Instruction::FPTrunc:
8426     case Instruction::FRem:
8427     case Instruction::FSub:
8428     case Instruction::ICmp:
8429     case Instruction::IntToPtr:
8430     case Instruction::LShr:
8431     case Instruction::Mul:
8432     case Instruction::Or:
8433     case Instruction::PtrToInt:
8434     case Instruction::SDiv:
8435     case Instruction::Select:
8436     case Instruction::SExt:
8437     case Instruction::Shl:
8438     case Instruction::SIToFP:
8439     case Instruction::SRem:
8440     case Instruction::Sub:
8441     case Instruction::Trunc:
8442     case Instruction::UDiv:
8443     case Instruction::UIToFP:
8444     case Instruction::URem:
8445     case Instruction::Xor:
8446     case Instruction::ZExt:
8447       return true;
8448     }
8449     return false;
8450   };
8451 
8452   if (!IsVectorizableOpcode(I->getOpcode()))
8453     return nullptr;
8454 
8455   // Success: widen this instruction.
8456   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8457 }
8458 
8459 void VPRecipeBuilder::fixHeaderPhis() {
8460   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8461   for (VPHeaderPHIRecipe *R : PhisToFix) {
8462     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8463     VPRecipeBase *IncR =
8464         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8465     R->addOperand(IncR->getVPSingleValue());
8466   }
8467 }
8468 
8469 VPBasicBlock *VPRecipeBuilder::handleReplication(
8470     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8471     VPlanPtr &Plan) {
8472   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8473       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8474       Range);
8475 
8476   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8477       [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
8478       Range);
8479 
8480   // Even if the instruction is not marked as uniform, there are certain
8481   // intrinsic calls that can be effectively treated as such, so we check for
8482   // them here. Conservatively, we only do this for scalable vectors, since
8483   // for fixed-width VFs we can always fall back on full scalarization.
8484   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8485     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8486     case Intrinsic::assume:
8487     case Intrinsic::lifetime_start:
8488     case Intrinsic::lifetime_end:
8489       // For scalable vectors if one of the operands is variant then we still
8490       // want to mark as uniform, which will generate one instruction for just
8491       // the first lane of the vector. We can't scalarize the call in the same
8492       // way as for fixed-width vectors because we don't know how many lanes
8493       // there are.
8494       //
8495       // The reasons for doing it this way for scalable vectors are:
8496       //   1. For the assume intrinsic generating the instruction for the first
      //      lane is still better than not generating any at all. For
8498       //      example, the input may be a splat across all lanes.
8499       //   2. For the lifetime start/end intrinsics the pointer operand only
8500       //      does anything useful when the input comes from a stack object,
8501       //      which suggests it should always be uniform. For non-stack objects
8502       //      the effect is to poison the object, which still allows us to
8503       //      remove the call.
8504       IsUniform = true;
8505       break;
8506     default:
8507       break;
8508     }
8509   }
8510 
8511   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8512                                        IsUniform, IsPredicated);
8513   setRecipe(I, Recipe);
8514   Plan->addVPValue(I, Recipe);
8515 
8516   // Find if I uses a predicated instruction. If so, it will use its scalar
8517   // value. Avoid hoisting the insert-element which packs the scalar value into
8518   // a vector value, as that happens iff all users use the vector value.
8519   for (VPValue *Op : Recipe->operands()) {
8520     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8521     if (!PredR)
8522       continue;
8523     auto *RepR =
8524         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8525     assert(RepR->isPredicated() &&
8526            "expected Replicate recipe to be predicated");
8527     RepR->setAlsoPack(false);
8528   }
8529 
8530   // Finalize the recipe for Instr, first if it is not predicated.
  // Finalize the recipe for Instr, handling the non-predicated case first.
8532     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8533     VPBB->appendRecipe(Recipe);
8534     return VPBB;
8535   }
8536   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8537 
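  // The predicated recipe is wrapped in a replicate region which is spliced in
  // between VPBB and its single successor:
  //   VPBB -> [pred.<opcode> region] -> new empty VPBasicBlock -> SingleSucc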
8538   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8539   assert(SingleSucc && "VPBB must have a single successor when handling "
8540                        "predicated replication.");
8541   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8542   // Record predicated instructions for above packing optimizations.
8543   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8544   VPBlockUtils::insertBlockAfter(Region, VPBB);
8545   auto *RegSucc = new VPBasicBlock();
8546   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8547   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8548   return RegSucc;
8549 }
8550 
8551 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8552                                                       VPRecipeBase *PredRecipe,
8553                                                       VPlanPtr &Plan) {
8554   // Instructions marked for predication are replicated and placed under an
8555   // if-then construct to prevent side-effects.
8556 
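  // The region built below has a triangular shape:
  //
  //   pred.<opcode>.entry     (BranchOnMask BlockInMask)
  //       |          \
  //       |     pred.<opcode>.if     (PredRecipe)
  //       |          /
  //   pred.<opcode>.continue  (VPPredInstPHI, if Instr produces a value)
  //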
8557   // Generate recipes to compute the block mask for this region.
8558   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8559 
8560   // Build the triangular if-then region.
8561   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8562   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8563   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8564   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8565   auto *PHIRecipe = Instr->getType()->isVoidTy()
8566                         ? nullptr
8567                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8568   if (PHIRecipe) {
8569     Plan->removeVPValueFor(Instr);
8570     Plan->addVPValue(Instr, PHIRecipe);
8571   }
8572   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8573   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8574   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8575 
8576   // Note: first set Entry as region entry and then connect successors starting
8577   // from it in order, to propagate the "parent" of each VPBasicBlock.
8578   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8579   VPBlockUtils::connectBlocks(Pred, Exit);
8580 
8581   return Region;
8582 }
8583 
8584 VPRecipeOrVPValueTy
8585 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8586                                         ArrayRef<VPValue *> Operands,
8587                                         VFRange &Range, VPlanPtr &Plan) {
8588   // First, check for specific widening recipes that deal with calls, memory
8589   // operations, inductions and Phi nodes.
8590   if (auto *CI = dyn_cast<CallInst>(Instr))
8591     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8592 
8593   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8594     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8595 
8596   VPRecipeBase *Recipe;
8597   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8598     if (Phi->getParent() != OrigLoop->getHeader())
8599       return tryToBlend(Phi, Operands, Plan);
8600     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8601       return toVPRecipeResult(Recipe);
8602 
8603     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8604     assert((Legal->isReductionVariable(Phi) ||
8605             Legal->isFirstOrderRecurrence(Phi)) &&
8606            "can only widen reductions and first-order recurrences here");
8607     VPValue *StartV = Operands[0];
8608     if (Legal->isReductionVariable(Phi)) {
8609       const RecurrenceDescriptor &RdxDesc =
8610           Legal->getReductionVars().find(Phi)->second;
8611       assert(RdxDesc.getRecurrenceStartValue() ==
8612              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8613       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8614                                            CM.isInLoopReduction(Phi),
8615                                            CM.useOrderedReductions(RdxDesc));
8616     } else {
8617       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8618     }
8619 
    // Record the incoming value from the backedge, so we can add the incoming
    // value from the backedge after all recipes have been created.
    recordRecipeOf(cast<Instruction>(
        Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
    PhisToFix.push_back(PhiRecipe);
    return toVPRecipeResult(PhiRecipe);
8626   }
8627 
8628   if (isa<TruncInst>(Instr) &&
8629       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8630                                                Range, *Plan)))
8631     return toVPRecipeResult(Recipe);
8632 
8633   if (!shouldWiden(Instr, Range))
8634     return nullptr;
8635 
8636   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8637     return toVPRecipeResult(new VPWidenGEPRecipe(
8638         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8639 
8640   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8641     bool InvariantCond =
8642         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8643     return toVPRecipeResult(new VPWidenSelectRecipe(
8644         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8645   }
8646 
8647   return toVPRecipeResult(tryToWiden(Instr, Operands));
8648 }
8649 
8650 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8651                                                         ElementCount MaxVF) {
8652   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8653 
8654   // Collect instructions from the original loop that will become trivially dead
8655   // in the vectorized loop. We don't need to vectorize these instructions. For
8656   // example, original induction update instructions can become dead because we
8657   // separately emit induction "steps" when generating code for the new loop.
8658   // Similarly, we create a new latch condition when setting up the structure
8659   // of the new loop, so the old one can become dead.
8660   SmallPtrSet<Instruction *, 4> DeadInstructions;
8661   collectTriviallyDeadInstructions(DeadInstructions);
8662 
8663   // Add assume instructions we need to drop to DeadInstructions, to prevent
8664   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
8666   // control flow is preserved, we should keep them.
8667   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8668   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8669 
8670   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8671   // Dead instructions do not need sinking. Remove them from SinkAfter.
8672   for (Instruction *I : DeadInstructions)
8673     SinkAfter.erase(I);
8674 
8675   // Cannot sink instructions after dead instructions (there won't be any
8676   // recipes for them). Instead, find the first non-dead previous instruction.
8677   for (auto &P : Legal->getSinkAfter()) {
8678     Instruction *SinkTarget = P.second;
8679     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8680     (void)FirstInst;
8681     while (DeadInstructions.contains(SinkTarget)) {
8682       assert(
8683           SinkTarget != FirstInst &&
8684           "Must find a live instruction (at least the one feeding the "
8685           "first-order recurrence PHI) before reaching beginning of the block");
8686       SinkTarget = SinkTarget->getPrevNode();
8687       assert(SinkTarget != P.first &&
8688              "sink source equals target, no sinking required");
8689     }
8690     P.second = SinkTarget;
8691   }
8692 
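  // Each iteration builds a single VPlan covering the sub-range
  // [VF, SubRange.End), where SubRange.End is clamped down by the widening
  // decisions taken while building that plan. For example, MinVF=4 and
  // MaxVF=16 may result in plans for {4,8} and {16} if some decision changes
  // at VF=16.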
8693   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8694   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8695     VFRange SubRange = {VF, MaxVFPlusOne};
8696     VPlans.push_back(
8697         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8698     VF = SubRange.End;
8699   }
8700 }
8701 
// Add a VPCanonicalIVPHIRecipe starting at 0 to the header and, to the latch,
// a CanonicalIVIncrement{NUW} VPInstruction incrementing it by VF * UF,
// followed by a BranchOnCount VPInstruction comparing the increment against
// the vector trip count.
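// For example, with VF=4 and UF=2 the latch conceptually computes, per vector
// iteration, IV.next = IV + 8 (with nuw when HasNUW is set, e.g. when the tail
// is not folded by masking) and then branches based on whether IV.next has
// reached the vector trip count.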
8705 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8706                                   bool HasNUW, bool IsVPlanNative) {
8707   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8708   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8709 
8710   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8711   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8712   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8713   Header->insert(CanonicalIVPHI, Header->begin());
8714 
8715   auto *CanonicalIVIncrement =
8716       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8717                                : VPInstruction::CanonicalIVIncrement,
8718                         {CanonicalIVPHI}, DL);
8719   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8720 
8721   VPBasicBlock *EB = TopRegion->getExitBasicBlock();
8722   if (IsVPlanNative)
8723     EB->setCondBit(nullptr);
8724   EB->appendRecipe(CanonicalIVIncrement);
8725 
8726   auto *BranchOnCount =
8727       new VPInstruction(VPInstruction::BranchOnCount,
8728                         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8729   EB->appendRecipe(BranchOnCount);
8730 }
8731 
8732 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8733     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8734     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8735 
8736   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8737 
8738   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8739 
8740   // ---------------------------------------------------------------------------
8741   // Pre-construction: record ingredients whose recipes we'll need to further
8742   // process after constructing the initial VPlan.
8743   // ---------------------------------------------------------------------------
8744 
8745   // Mark instructions we'll need to sink later and their targets as
8746   // ingredients whose recipe we'll need to record.
8747   for (auto &Entry : SinkAfter) {
8748     RecipeBuilder.recordRecipeOf(Entry.first);
8749     RecipeBuilder.recordRecipeOf(Entry.second);
8750   }
8751   for (auto &Reduction : CM.getInLoopReductionChains()) {
8752     PHINode *Phi = Reduction.first;
8753     RecurKind Kind =
8754         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8755     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8756 
8757     RecipeBuilder.recordRecipeOf(Phi);
8758     for (auto &R : ReductionOperations) {
8759       RecipeBuilder.recordRecipeOf(R);
8760       // For min/max reductions, where we have a pair of icmp/select, we also
8761       // need to record the ICmp recipe, so it can be removed later.
8762       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8763              "Only min/max recurrences allowed for inloop reductions");
8764       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8765         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8766     }
8767   }
8768 
8769   // For each interleave group which is relevant for this (possibly trimmed)
8770   // Range, add it to the set of groups to be later applied to the VPlan and add
8771   // placeholders for its members' Recipes which we'll be replacing with a
8772   // single VPInterleaveRecipe.
8773   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8774     auto applyIG = [IG, this](ElementCount VF) -> bool {
8775       return (VF.isVector() && // Query is illegal for VF == 1
8776               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8777                   LoopVectorizationCostModel::CM_Interleave);
8778     };
8779     if (!getDecisionAndClampRange(applyIG, Range))
8780       continue;
8781     InterleaveGroups.insert(IG);
8782     for (unsigned i = 0; i < IG->getFactor(); i++)
8783       if (Instruction *Member = IG->getMember(i))
8784         RecipeBuilder.recordRecipeOf(Member);
  }
8786 
8787   // ---------------------------------------------------------------------------
8788   // Build initial VPlan: Scan the body of the loop in a topological order to
8789   // visit each basic block after having visited its predecessor basic blocks.
8790   // ---------------------------------------------------------------------------
8791 
8792   // Create initial VPlan skeleton, starting with a block for the pre-header,
8793   // followed by a region for the vector loop, followed by the middle block. The
8794   // skeleton vector loop region contains a header and latch block.
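  // That is, the initial plan has the shape:
  //   vector.ph -> [ vector.body -> vector.latch ] -> middle.block
  // where the bracketed part forms the "vector loop" region.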
8795   VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
8796   auto Plan = std::make_unique<VPlan>(Preheader);
8797 
8798   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8799   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8800   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8801   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8802   VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
8803   VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
8804   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
8805 
8806   Instruction *DLInst =
8807       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8808   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8809                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8810                         !CM.foldTailByMasking(), false);
8811 
8812   // Scan the body of the loop in a topological order to visit each basic block
8813   // after having visited its predecessor basic blocks.
8814   LoopBlocksDFS DFS(OrigLoop);
8815   DFS.perform(LI);
8816 
8817   VPBasicBlock *VPBB = HeaderVPBB;
8818   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8819   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8820     // Relevant instructions from basic block BB will be grouped into VPRecipe
8821     // ingredients and fill a new VPBasicBlock.
8822     unsigned VPBBsForBB = 0;
8823     if (VPBB != HeaderVPBB)
8824       VPBB->setName(BB->getName());
8825     Builder.setInsertPoint(VPBB);
8826 
8827     // Introduce each ingredient into VPlan.
8828     // TODO: Model and preserve debug intrinsics in VPlan.
8829     for (Instruction &I : BB->instructionsWithoutDebug()) {
8830       Instruction *Instr = &I;
8831 
8832       // First filter out irrelevant instructions, to ensure no recipes are
8833       // built for them.
8834       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8835         continue;
8836 
8837       SmallVector<VPValue *, 4> Operands;
8838       auto *Phi = dyn_cast<PHINode>(Instr);
8839       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8840         Operands.push_back(Plan->getOrAddVPValue(
8841             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8842       } else {
8843         auto OpRange = Plan->mapToVPValues(Instr->operands());
8844         Operands = {OpRange.begin(), OpRange.end()};
8845       }
8846       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8847               Instr, Operands, Range, Plan)) {
8848         // If Instr can be simplified to an existing VPValue, use it.
8849         if (RecipeOrValue.is<VPValue *>()) {
8850           auto *VPV = RecipeOrValue.get<VPValue *>();
8851           Plan->addVPValue(Instr, VPV);
8852           // If the re-used value is a recipe, register the recipe for the
8853           // instruction, in case the recipe for Instr needs to be recorded.
8854           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
8855             RecipeBuilder.setRecipe(Instr, R);
8856           continue;
8857         }
8858         // Otherwise, add the new recipe.
8859         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8860         for (auto *Def : Recipe->definedValues()) {
8861           auto *UV = Def->getUnderlyingValue();
8862           Plan->addVPValue(UV, Def);
8863         }
8864 
8865         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8866             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8867           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8868           // of the header block. That can happen for truncates of induction
8869           // variables. Those recipes are moved to the phi section of the header
8870           // block after applying SinkAfter, which relies on the original
8871           // position of the trunc.
8872           assert(isa<TruncInst>(Instr));
8873           InductionsToMove.push_back(
8874               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8875         }
8876         RecipeBuilder.setRecipe(Instr, Recipe);
8877         VPBB->appendRecipe(Recipe);
8878         continue;
8879       }
8880 
      // Invariant stores inside the loop will be deleted, and a single store
      // with the final reduction value will be added to the exit block.
8883       StoreInst *SI;
8884       if ((SI = dyn_cast<StoreInst>(&I)) &&
8885           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8886         continue;
8887 
8888       // Otherwise, if all widening options failed, Instruction is to be
8889       // replicated. This may create a successor for VPBB.
8890       VPBasicBlock *NextVPBB =
8891           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8892       if (NextVPBB != VPBB) {
8893         VPBB = NextVPBB;
8894         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8895                                     : "");
8896       }
8897     }
8898 
8899     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8900     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8901   }
8902 
8903   HeaderVPBB->setName("vector.body");
8904 
8905   // Fold the last, empty block into its predecessor.
8906   VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
8907   assert(VPBB && "expected to fold last (empty) block");
8908   // After here, VPBB should not be used.
8909   VPBB = nullptr;
8910 
8911   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8912          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8913          "entry block must be set to a VPRegionBlock having a non-empty entry "
8914          "VPBasicBlock");
8915   RecipeBuilder.fixHeaderPhis();
8916 
8917   // ---------------------------------------------------------------------------
8918   // Transform initial VPlan: Apply previously taken decisions, in order, to
8919   // bring the VPlan to its final state.
8920   // ---------------------------------------------------------------------------
8921 
8922   // Apply Sink-After legal constraints.
8923   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
8924     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
8925     if (Region && Region->isReplicator()) {
8926       assert(Region->getNumSuccessors() == 1 &&
8927              Region->getNumPredecessors() == 1 && "Expected SESE region!");
8928       assert(R->getParent()->size() == 1 &&
8929              "A recipe in an original replicator region must be the only "
8930              "recipe in its block");
8931       return Region;
8932     }
8933     return nullptr;
8934   };
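  // Three cases are handled below: sinking a plain recipe (possibly to just
  // after a target replicate region), and sinking a whole replicate region
  // either directly after the target's replicate region or after the target
  // recipe itself, splitting the target's block at that point.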
8935   for (auto &Entry : SinkAfter) {
8936     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8937     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8938 
8939     auto *TargetRegion = GetReplicateRegion(Target);
8940     auto *SinkRegion = GetReplicateRegion(Sink);
8941     if (!SinkRegion) {
8942       // If the sink source is not a replicate region, sink the recipe directly.
8943       if (TargetRegion) {
8944         // The target is in a replication region, make sure to move Sink to
8945         // the block after it, not into the replication region itself.
8946         VPBasicBlock *NextBlock =
8947             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
8948         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
8949       } else
8950         Sink->moveAfter(Target);
8951       continue;
8952     }
8953 
8954     // The sink source is in a replicate region. Unhook the region from the CFG.
8955     auto *SinkPred = SinkRegion->getSinglePredecessor();
8956     auto *SinkSucc = SinkRegion->getSingleSuccessor();
8957     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
8958     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
8959     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
8960 
8961     if (TargetRegion) {
8962       // The target recipe is also in a replicate region, move the sink region
8963       // after the target region.
8964       auto *TargetSucc = TargetRegion->getSingleSuccessor();
8965       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
8966       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
8967       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
8968     } else {
8969       // The sink source is in a replicate region, we need to move the whole
8970       // replicate region, which should only contain a single recipe in the
8971       // main block.
8972       auto *SplitBlock =
8973           Target->getParent()->splitAt(std::next(Target->getIterator()));
8974 
8975       auto *SplitPred = SplitBlock->getSinglePredecessor();
8976 
8977       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
8978       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
8979       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
8980     }
8981   }
8982 
8983   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
8984   VPlanTransforms::removeRedundantInductionCasts(*Plan);
8985 
8986   // Now that sink-after is done, move induction recipes for optimized truncates
8987   // to the phi section of the header block.
8988   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
8989     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8990 
8991   // Adjust the recipes for any inloop reductions.
8992   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan,
8993                              RecipeBuilder, Range.Start);
8994 
8995   // Introduce a recipe to combine the incoming and previous values of a
8996   // first-order recurrence.
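  // For example, a recurrence phi %p whose backedge value is %prev becomes
  //   %splice = first-order-recurrence-splice %p, %prev
  // and all previous users of %p are rewired to use %splice instead.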
8997   for (VPRecipeBase &R :
8998        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8999     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9000     if (!RecurPhi)
9001       continue;
9002 
9003     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
9004     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9005     auto *Region = GetReplicateRegion(PrevRecipe);
9006     if (Region)
9007       InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
9008     if (Region || PrevRecipe->isPhi())
9009       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9010     else
9011       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9012 
9013     auto *RecurSplice = cast<VPInstruction>(
9014         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9015                              {RecurPhi, RecurPhi->getBackedgeValue()}));
9016 
9017     RecurPhi->replaceAllUsesWith(RecurSplice);
9018     // Set the first operand of RecurSplice to RecurPhi again, after replacing
9019     // all users.
9020     RecurSplice->setOperand(0, RecurPhi);
9021   }
9022 
9023   // Interleave memory: for each Interleave Group we marked earlier as relevant
9024   // for this VPlan, replace the Recipes widening its memory instructions with a
9025   // single VPInterleaveRecipe at its insertion point.
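  // For a group with factor N, the N member recipes are erased and replaced by
  // a single VPInterleaveRecipe at the insert position; the values defined by
  // the group's wide loads take over all uses of the original member values.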
9026   for (auto IG : InterleaveGroups) {
9027     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9028         RecipeBuilder.getRecipe(IG->getInsertPos()));
9029     SmallVector<VPValue *, 4> StoredValues;
9030     for (unsigned i = 0; i < IG->getFactor(); ++i)
9031       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9032         auto *StoreR =
9033             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9034         StoredValues.push_back(StoreR->getStoredValue());
9035       }
9036 
9037     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9038                                         Recipe->getMask());
9039     VPIG->insertBefore(Recipe);
9040     unsigned J = 0;
9041     for (unsigned i = 0; i < IG->getFactor(); ++i)
9042       if (Instruction *Member = IG->getMember(i)) {
9043         if (!Member->getType()->isVoidTy()) {
9044           VPValue *OriginalV = Plan->getVPValue(Member);
9045           Plan->removeVPValueFor(Member);
9046           Plan->addVPValue(Member, VPIG->getVPValue(J));
9047           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9048           J++;
9049         }
9050         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9051       }
9052   }
9053 
  // From this point onwards, VPlan-to-VPlan transformations may change the
  // plan in ways that make accessing values via their original IR values
  // incorrect.
9056   Plan->disableValue2VPValue();
9057 
9058   VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9059   VPlanTransforms::sinkScalarOperands(*Plan);
9060   VPlanTransforms::mergeReplicateRegions(*Plan);
9061   VPlanTransforms::removeDeadRecipes(*Plan, *OrigLoop);
9062   VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
9063 
9064   std::string PlanName;
9065   raw_string_ostream RSO(PlanName);
9066   ElementCount VF = Range.Start;
9067   Plan->addVF(VF);
9068   RSO << "Initial VPlan for VF={" << VF;
9069   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9070     Plan->addVF(VF);
9071     RSO << "," << VF;
9072   }
9073   RSO << "},UF>=1";
9074   RSO.flush();
9075   Plan->setName(PlanName);
9076 
9077   // Fold Exit block into its predecessor if possible.
9078   // TODO: Fold block earlier once all VPlan transforms properly maintain a
9079   // VPBasicBlock as exit.
9080   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
9081 
9082   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9083   return Plan;
9084 }
9085 
9086 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9087   // Outer loop handling: They may require CFG and instruction level
9088   // transformations before even evaluating whether vectorization is profitable.
9089   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9090   // the vectorization pipeline.
9091   assert(!OrigLoop->isInnermost());
9092   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9093 
9094   // Create new empty VPlan
9095   auto Plan = std::make_unique<VPlan>();
9096 
9097   // Build hierarchical CFG
9098   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9099   HCFGBuilder.buildHierarchicalCFG();
9100 
9101   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9102        VF *= 2)
9103     Plan->addVF(VF);
9104 
9105   if (EnableVPlanPredication) {
9106     VPlanPredicator VPP(*Plan);
9107     VPP.predicate();
9108 
9109     // Avoid running transformation to recipes until masked code generation in
9110     // VPlan-native path is in place.
9111     return Plan;
9112   }
9113 
9114   SmallPtrSet<Instruction *, 1> DeadInstructions;
9115   VPlanTransforms::VPInstructionsToVPRecipes(
9116       OrigLoop, Plan,
9117       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9118       DeadInstructions, *PSE.getSE());
9119 
9120   // Update plan to be compatible with the inner loop vectorizer for
9121   // code-generation.
9122   VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
9123   VPBasicBlock *Preheader = LoopRegion->getEntryBasicBlock();
9124   VPBasicBlock *Exit = LoopRegion->getExitBasicBlock();
9125   VPBlockBase *Latch = Exit->getSinglePredecessor();
9126   VPBlockBase *Header = Preheader->getSingleSuccessor();
9127 
9128   // 1. Move preheader block out of main vector loop.
9129   Preheader->setParent(LoopRegion->getParent());
9130   VPBlockUtils::disconnectBlocks(Preheader, Header);
9131   VPBlockUtils::connectBlocks(Preheader, LoopRegion);
9132   Plan->setEntry(Preheader);
9133 
9134   // 2. Disconnect backedge and exit block.
9135   VPBlockUtils::disconnectBlocks(Latch, Header);
9136   VPBlockUtils::disconnectBlocks(Latch, Exit);
9137 
9138   // 3. Update entry and exit of main vector loop region.
9139   LoopRegion->setEntry(Header);
9140   LoopRegion->setExit(Latch);
9141 
9142   // 4. Remove exit block.
9143   delete Exit;
9144 
9145   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9146                         true, true);
9147   return Plan;
9148 }
9149 
// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instruction to the phi needs to be
// converted to reductions, with one operand being a vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
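// For example, an in-loop integer add reduction
//   %red = add %red.phi, %vec.op
// is roughly replaced by a VPReductionRecipe that reduces %vec.op (under the
// block mask, if the block needs predication) and adds the result to the
// scalar chain value %red.phi.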
9155 void LoopVectorizationPlanner::adjustRecipesForReductions(
9156     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9157     ElementCount MinVF) {
9158   for (auto &Reduction : CM.getInLoopReductionChains()) {
9159     PHINode *Phi = Reduction.first;
9160     const RecurrenceDescriptor &RdxDesc =
9161         Legal->getReductionVars().find(Phi)->second;
9162     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9163 
9164     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9165       continue;
9166 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
9169     // which of the two operands will remain scalar and which will be reduced.
9170     // For minmax the chain will be the select instructions.
9171     Instruction *Chain = Phi;
9172     for (Instruction *R : ReductionOperations) {
9173       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9174       RecurKind Kind = RdxDesc.getRecurrenceKind();
9175 
9176       VPValue *ChainOp = Plan->getVPValue(Chain);
9177       unsigned FirstOpId;
9178       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9179              "Only min/max recurrences allowed for inloop reductions");
9180       // Recognize a call to the llvm.fmuladd intrinsic.
9181       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9182       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9183              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9184       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9185         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9186                "Expected to replace a VPWidenSelectSC");
9187         FirstOpId = 1;
9188       } else {
9189         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9190                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9191                "Expected to replace a VPWidenSC");
9192         FirstOpId = 0;
9193       }
9194       unsigned VecOpId =
9195           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9196       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9197 
9198       auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
9199                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9200                          : nullptr;
9201 
9202       if (IsFMulAdd) {
9203         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9204         // need to create an fmul recipe to use as the vector operand for the
9205         // fadd reduction.
9206         VPInstruction *FMulRecipe = new VPInstruction(
9207             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9208         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9209         WidenRecipe->getParent()->insert(FMulRecipe,
9210                                          WidenRecipe->getIterator());
9211         VecOp = FMulRecipe;
9212       }
9213       VPReductionRecipe *RedRecipe =
9214           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9215       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9216       Plan->removeVPValueFor(R);
9217       Plan->addVPValue(R, RedRecipe);
9218       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9219       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9220       WidenRecipe->eraseFromParent();
9221 
9222       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9223         VPRecipeBase *CompareRecipe =
9224             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9225         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9226                "Expected to replace a VPWidenSC");
9227         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9228                "Expected no remaining users");
9229         CompareRecipe->eraseFromParent();
9230       }
9231       Chain = R;
9232     }
9233   }
9234 
9235   // If tail is folded by masking, introduce selects between the phi
9236   // and the live-out instruction of each reduction, at the beginning of the
9237   // dedicated latch block.
9238   if (CM.foldTailByMasking()) {
9239     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9240     for (VPRecipeBase &R :
9241          Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9242       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9243       if (!PhiR || PhiR->isInLoop())
9244         continue;
9245       VPValue *Cond =
9246           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9247       VPValue *Red = PhiR->getBackedgeValue();
9248       assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
9249              "reduction recipe must be defined before latch");
9250       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9251     }
9252   }
9253 }
9254 
9255 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9256 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9257                                VPSlotTracker &SlotTracker) const {
9258   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9259   IG->getInsertPos()->printAsOperand(O, false);
9260   O << ", ";
9261   getAddr()->printAsOperand(O, SlotTracker);
9262   VPValue *Mask = getMask();
9263   if (Mask) {
9264     O << ", ";
9265     Mask->printAsOperand(O, SlotTracker);
9266   }
9267 
9268   unsigned OpIdx = 0;
9269   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9270     if (!IG->getMember(i))
9271       continue;
9272     if (getNumStoreOperands() > 0) {
9273       O << "\n" << Indent << "  store ";
9274       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9275       O << " to index " << i;
9276     } else {
9277       O << "\n" << Indent << "  ";
9278       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9279       O << " = load from index " << i;
9280     }
9281     ++OpIdx;
9282   }
9283 }
9284 #endif
9285 
9286 void VPWidenCallRecipe::execute(VPTransformState &State) {
9287   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9288                                   *this, State);
9289 }
9290 
9291 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9292   auto &I = *cast<SelectInst>(getUnderlyingInstr());
9293   State.ILV->setDebugLocFromInst(&I);
9294 
  // The condition can be loop invariant but still defined inside the
9296   // loop. This means that we can't just use the original 'cond' value.
9297   // We have to take the 'vectorized' value and pick the first lane.
9298   // Instcombine will make this a no-op.
9299   auto *InvarCond =
9300       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9301 
9302   for (unsigned Part = 0; Part < State.UF; ++Part) {
9303     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9304     Value *Op0 = State.get(getOperand(1), Part);
9305     Value *Op1 = State.get(getOperand(2), Part);
9306     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9307     State.set(this, Sel, Part);
9308     State.ILV->addMetadata(Sel, &I);
9309   }
9310 }
9311 
9312 void VPWidenRecipe::execute(VPTransformState &State) {
9313   auto &I = *cast<Instruction>(getUnderlyingValue());
9314   auto &Builder = State.Builder;
9315   switch (I.getOpcode()) {
9316   case Instruction::Call:
9317   case Instruction::Br:
9318   case Instruction::PHI:
9319   case Instruction::GetElementPtr:
9320   case Instruction::Select:
9321     llvm_unreachable("This instruction is handled by a different recipe.");
9322   case Instruction::UDiv:
9323   case Instruction::SDiv:
9324   case Instruction::SRem:
9325   case Instruction::URem:
9326   case Instruction::Add:
9327   case Instruction::FAdd:
9328   case Instruction::Sub:
9329   case Instruction::FSub:
9330   case Instruction::FNeg:
9331   case Instruction::Mul:
9332   case Instruction::FMul:
9333   case Instruction::FDiv:
9334   case Instruction::FRem:
9335   case Instruction::Shl:
9336   case Instruction::LShr:
9337   case Instruction::AShr:
9338   case Instruction::And:
9339   case Instruction::Or:
9340   case Instruction::Xor: {
9341     // Just widen unops and binops.
9342     State.ILV->setDebugLocFromInst(&I);
9343 
9344     for (unsigned Part = 0; Part < State.UF; ++Part) {
9345       SmallVector<Value *, 2> Ops;
9346       for (VPValue *VPOp : operands())
9347         Ops.push_back(State.get(VPOp, Part));
9348 
9349       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
9350 
9351       if (auto *VecOp = dyn_cast<Instruction>(V)) {
9352         VecOp->copyIRFlags(&I);
9353 
9354         // If the instruction is vectorized and was in a basic block that needed
9355         // predication, we can't propagate poison-generating flags (nuw/nsw,
9356         // exact, etc.). The control flow has been linearized and the
        // instruction is no longer guarded by the predicate, which could cause
        // the flag properties to no longer hold.
9359         if (State.MayGeneratePoisonRecipes.contains(this))
9360           VecOp->dropPoisonGeneratingFlags();
9361       }
9362 
9363       // Use this vector value for all users of the original instruction.
9364       State.set(this, V, Part);
9365       State.ILV->addMetadata(V, &I);
9366     }
9367 
9368     break;
9369   }
9370   case Instruction::ICmp:
9371   case Instruction::FCmp: {
9372     // Widen compares. Generate vector compares.
9373     bool FCmp = (I.getOpcode() == Instruction::FCmp);
9374     auto *Cmp = cast<CmpInst>(&I);
9375     State.ILV->setDebugLocFromInst(Cmp);
9376     for (unsigned Part = 0; Part < State.UF; ++Part) {
9377       Value *A = State.get(getOperand(0), Part);
9378       Value *B = State.get(getOperand(1), Part);
9379       Value *C = nullptr;
9380       if (FCmp) {
9381         // Propagate fast math flags.
9382         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9383         Builder.setFastMathFlags(Cmp->getFastMathFlags());
9384         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
9385       } else {
9386         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
9387       }
9388       State.set(this, C, Part);
9389       State.ILV->addMetadata(C, &I);
9390     }
9391 
9392     break;
9393   }
9394 
9395   case Instruction::ZExt:
9396   case Instruction::SExt:
9397   case Instruction::FPToUI:
9398   case Instruction::FPToSI:
9399   case Instruction::FPExt:
9400   case Instruction::PtrToInt:
9401   case Instruction::IntToPtr:
9402   case Instruction::SIToFP:
9403   case Instruction::UIToFP:
9404   case Instruction::Trunc:
9405   case Instruction::FPTrunc:
9406   case Instruction::BitCast: {
9407     auto *CI = cast<CastInst>(&I);
9408     State.ILV->setDebugLocFromInst(CI);
9409 
    // Vectorize casts.
9411     Type *DestTy = (State.VF.isScalar())
9412                        ? CI->getType()
9413                        : VectorType::get(CI->getType(), State.VF);
9414 
9415     for (unsigned Part = 0; Part < State.UF; ++Part) {
9416       Value *A = State.get(getOperand(0), Part);
9417       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
9418       State.set(this, Cast, Part);
9419       State.ILV->addMetadata(Cast, &I);
9420     }
9421     break;
9422   }
9423   default:
9424     // This instruction is not vectorized by simple widening.
9425     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
9426     llvm_unreachable("Unhandled instruction!");
9427   } // end of switch.
9428 }
9429 
9430 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9431   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
9432   // Construct a vector GEP by widening the operands of the scalar GEP as
9433   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
9434   // results in a vector of pointers when at least one operand of the GEP
9435   // is vector-typed. Thus, to keep the representation compact, we only use
9436   // vector-typed operands for loop-varying values.
9437 
9438   if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
9439     // If we are vectorizing, but the GEP has only loop-invariant operands,
9440     // the GEP we build (by only using vector-typed operands for
9441     // loop-varying values) would be a scalar pointer. Thus, to ensure we
9442     // produce a vector of pointers, we need to either arbitrarily pick an
9443     // operand to broadcast, or broadcast a clone of the original GEP.
9444     // Here, we broadcast a clone of the original.
9445     //
9446     // TODO: If at some point we decide to scalarize instructions having
9447     //       loop-invariant operands, this special case will no longer be
9448     //       required. We would add the scalarization decision to
9449     //       collectLoopScalars() and teach getVectorValue() to broadcast
9450     //       the lane-zero scalar value.
9451     auto *Clone = State.Builder.Insert(GEP->clone());
9452     for (unsigned Part = 0; Part < State.UF; ++Part) {
9453       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
9454       State.set(this, EntryPart, Part);
9455       State.ILV->addMetadata(EntryPart, GEP);
9456     }
9457   } else {
9458     // If the GEP has at least one loop-varying operand, we are sure to
9459     // produce a vector of pointers. But if we are only unrolling, we want
9460     // to produce a scalar GEP for each unroll part. Thus, the GEP we
9461     // produce with the code below will be scalar (if VF == 1) or vector
9462     // (otherwise). Note that for the unroll-only case, we still maintain
9463     // values in the vector mapping with initVector, as we do for other
9464     // instructions.
9465     for (unsigned Part = 0; Part < State.UF; ++Part) {
9466       // The pointer operand of the new GEP. If it's loop-invariant, we
9467       // won't broadcast it.
9468       auto *Ptr = IsPtrLoopInvariant
9469                       ? State.get(getOperand(0), VPIteration(0, 0))
9470                       : State.get(getOperand(0), Part);
9471 
9472       // Collect all the indices for the new GEP. If any index is
9473       // loop-invariant, we won't broadcast it.
9474       SmallVector<Value *, 4> Indices;
9475       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9476         VPValue *Operand = getOperand(I);
9477         if (IsIndexLoopInvariant[I - 1])
9478           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9479         else
9480           Indices.push_back(State.get(Operand, Part));
9481       }
9482 
9483       // If the GEP instruction is vectorized and was in a basic block that
9484       // needed predication, we can't propagate the poison-generating 'inbounds'
9485       // flag. The control flow has been linearized and the GEP is no longer
      // guarded by the predicate, which could cause the 'inbounds' property to
      // no longer hold.
9488       bool IsInBounds =
9489           GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9490 
9491       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
9492       // but it should be a vector, otherwise.
9493       auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
9494                                              Indices, "", IsInBounds);
9495       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9496              "NewGEP is not a pointer vector");
9497       State.set(this, NewGEP, Part);
9498       State.ILV->addMetadata(NewGEP, GEP);
9499     }
9500   }
9501 }
9502 
9503 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9504   assert(!State.Instance && "Int or FP induction being replicated.");
9505 
9506   Value *Start = getStartValue()->getLiveInIRValue();
9507   const InductionDescriptor &ID = getInductionDescriptor();
9508   TruncInst *Trunc = getTruncInst();
9509   IRBuilderBase &Builder = State.Builder;
9510   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9511   assert(State.VF.isVector() && "must have vector VF");
9512 
9513   // The value from the original loop to which we are mapping the new induction
9514   // variable.
9515   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9516 
9517   // Fast-math-flags propagate from the original induction instruction.
9518   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9519   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9520     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9521 
9522   // Now do the actual transformations, and start with fetching the step value.
9523   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9524 
9525   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
9526          "Expected either an induction phi-node or a truncate of it!");
9527 
9528   // Construct the initial value of the vector IV in the vector loop preheader
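  // For example, for VF=4 the initial value is the vector
  //   <Start, Start + Step, Start + 2*Step, Start + 3*Step>
  // and each further unroll part is formed by adding SplatVF = VF * Step to
  // the previous part.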
9529   auto CurrIP = Builder.saveIP();
9530   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9531   Builder.SetInsertPoint(VectorPH->getTerminator());
9532   if (isa<TruncInst>(EntryVal)) {
9533     assert(Start->getType()->isIntegerTy() &&
9534            "Truncation requires an integer type");
9535     auto *TruncType = cast<IntegerType>(EntryVal->getType());
9536     Step = Builder.CreateTrunc(Step, TruncType);
9537     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
9538   }
9539 
9540   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
9541   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
9542   Value *SteppedStart = getStepVector(
9543       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
9544 
9545   // We create vector phi nodes for both integer and floating-point induction
9546   // variables. Here, we determine the kind of arithmetic we will perform.
9547   Instruction::BinaryOps AddOp;
9548   Instruction::BinaryOps MulOp;
9549   if (Step->getType()->isIntegerTy()) {
9550     AddOp = Instruction::Add;
9551     MulOp = Instruction::Mul;
9552   } else {
9553     AddOp = ID.getInductionOpcode();
9554     MulOp = Instruction::FMul;
9555   }
9556 
9557   // Multiply the vectorization factor by the step using integer or
9558   // floating-point arithmetic as appropriate.
9559   Type *StepType = Step->getType();
9560   Value *RuntimeVF;
9561   if (Step->getType()->isFloatingPointTy())
9562     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
9563   else
9564     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
9565   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
9566 
9567   // Create a vector splat to use in the induction update.
9568   //
9569   // FIXME: If the step is non-constant, we create the vector splat with
9570   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
9571   //        handle a constant vector splat.
9572   Value *SplatVF = isa<Constant>(Mul)
9573                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
9574                        : Builder.CreateVectorSplat(State.VF, Mul);
9575   Builder.restoreIP(CurrIP);
9576 
9577   // We may need to add the step a number of times, depending on the unroll
9578   // factor. The last of those goes into the PHI.
9579   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
9580                                     &*State.CFG.PrevBB->getFirstInsertionPt());
9581   VecInd->setDebugLoc(EntryVal->getDebugLoc());
9582   Instruction *LastInduction = VecInd;
9583   for (unsigned Part = 0; Part < State.UF; ++Part) {
9584     State.set(this, LastInduction, Part);
9585 
9586     if (isa<TruncInst>(EntryVal))
9587       State.ILV->addMetadata(LastInduction, EntryVal);
9588 
9589     LastInduction = cast<Instruction>(
9590         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
9591     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
9592   }
9593 
9594   LastInduction->setName("vec.ind.next");
9595   VecInd->addIncoming(SteppedStart, VectorPH);
9596   // Add induction update using an incorrect block temporarily. The phi node
9597   // will be fixed after VPlan execution. Note that at this point the latch
9598   // block cannot be used, as it does not exist yet.
9599   // TODO: Model increment value in VPlan, by turning the recipe into a
9600   // multi-def and a subclass of VPHeaderPHIRecipe.
9601   VecInd->addIncoming(LastInduction, VectorPH);
9602 }
9603 
9604 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9605   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9606          "Not a pointer induction according to InductionDescriptor!");
9607   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9608          "Unexpected type.");
9609 
9610   auto *IVR = getParent()->getPlan()->getCanonicalIV();
9611   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9612 
9613   if (all_of(users(),
9614              [this](const VPUser *U) { return U->usesScalars(this); })) {
9615     // This is the normalized GEP that starts counting at zero.
9616     Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9617         CanonicalIV, IndDesc.getStep()->getType());
9618     // Determine the number of scalars we need to generate for each unroll
9619     // iteration. If the instruction is uniform, we only need to generate the
9620     // first lane. Otherwise, we generate all VF values.
9621     bool IsUniform = vputils::onlyFirstLaneUsed(this);
9622     assert((IsUniform || !State.VF.isScalable()) &&
9623            "Cannot scalarize a scalable VF");
9624     unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9625 
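    // Illustratively, each scalar generated below is the transformed index
    //   StartValue + (CanonicalIV + Part * VF + Lane) * Step
    // produced by emitTransformedIndex.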
9626     for (unsigned Part = 0; Part < State.UF; ++Part) {
9627       Value *PartStart =
9628           createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9629 
9630       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9631         Value *Idx = State.Builder.CreateAdd(
9632             PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9633         Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9634 
9635         Value *Step = CreateStepValue(IndDesc.getStep(), SE,
9636                                       State.CFG.PrevBB->getTerminator());
9637         Value *SclrGep = emitTransformedIndex(
9638             State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
9639         SclrGep->setName("next.gep");
9640         State.set(this, SclrGep, VPIteration(Part, Lane));
9641       }
9642     }
9643     return;
9644   }
9645 
9646   assert(isa<SCEVConstant>(IndDesc.getStep()) &&
9647          "Induction step not a SCEV constant!");
9648   Type *PhiType = IndDesc.getStep()->getType();
9649 
9650   // Build a pointer phi
9651   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9652   Type *ScStValueType = ScalarStartValue->getType();
9653   PHINode *NewPointerPhi =
9654       PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9655 
9656   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9657   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9658 
9659   // A pointer induction, performed by using a gep
9660   const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout();
9661   Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9662 
9663   const SCEV *ScalarStep = IndDesc.getStep();
9664   SCEVExpander Exp(SE, DL, "induction");
9665   Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
9666   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9667   Value *NumUnrolledElems =
9668       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9669   Value *InductionGEP = GetElementPtrInst::Create(
9670       IndDesc.getElementType(), NewPointerPhi,
9671       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9672       InductionLoc);
9673   // Add induction update using an incorrect block temporarily. The phi node
9674   // will be fixed after VPlan execution. Note that at this point the latch
9675   // block cannot be used, as it does not exist yet.
9676   // TODO: Model increment value in VPlan, by turning the recipe into a
9677   // multi-def and a subclass of VPHeaderPHIRecipe.
9678   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9679 
9680   // Create UF many actual address geps that use the pointer
9681   // phi as base and a vectorized version of the step value
9682   // (<step*0, ..., step*N>) as offset.
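  // E.g. with UF=2, part 0 uses offsets <0, 1, ..., VF-1> * Step and part 1
  // uses offsets <VF, VF+1, ..., 2*VF-1> * Step from the pointer phi.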
9683   for (unsigned Part = 0; Part < State.UF; ++Part) {
9684     Type *VecPhiType = VectorType::get(PhiType, State.VF);
9685     Value *StartOffsetScalar =
9686         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9687     Value *StartOffset =
9688         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9689     // Add a step vector of consecutive lane numbers <0, 1, ..., VF - 1>.
9690     StartOffset = State.Builder.CreateAdd(
9691         StartOffset, State.Builder.CreateStepVector(VecPhiType));
9692 
9693     Value *GEP = State.Builder.CreateGEP(
9694         IndDesc.getElementType(), NewPointerPhi,
9695         State.Builder.CreateMul(
9696             StartOffset,
9697             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9698             "vector.gep"));
9699     State.set(this, GEP, Part);
9700   }
9701 }
9702 
9703 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9704   assert(!State.Instance && "VPScalarIVStepsRecipe being replicated.");
9705 
9706   // Fast-math-flags propagate from the original induction instruction.
9707   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9708   if (IndDesc.getInductionBinOp() &&
9709       isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9710     State.Builder.setFastMathFlags(
9711         IndDesc.getInductionBinOp()->getFastMathFlags());
9712 
9713   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9714   auto CreateScalarIV = [&](Value *&Step) -> Value * {
9715     Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9716     auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
9717     if (!isCanonical() || CanonicalIV->getType() != Ty) {
9718       ScalarIV =
9719           Ty->isIntegerTy()
9720               ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty)
9721               : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty);
9722       ScalarIV = emitTransformedIndex(State.Builder, ScalarIV,
9723                                       getStartValue()->getLiveInIRValue(), Step,
9724                                       IndDesc);
9725       ScalarIV->setName("offset.idx");
9726     }
9727     if (TruncToTy) {
9728       assert(Step->getType()->isIntegerTy() &&
9729              "Truncation requires an integer step");
9730       ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy);
9731       Step = State.Builder.CreateTrunc(Step, TruncToTy);
9732     }
9733     return ScalarIV;
9734   };
9735 
9736   Value *ScalarIV = CreateScalarIV(Step);
9737   if (State.VF.isVector()) {
9738     buildScalarSteps(ScalarIV, Step, IndDesc, this, State);
9739     return;
9740   }
9741 
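  // VF is scalar here, so emit one value per part: roughly
  // ScalarIV + Part * Step, using FMul and the FP opcode for FP inductions.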
9742   for (unsigned Part = 0; Part < State.UF; ++Part) {
9743     assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
9744     Value *EntryPart;
9745     if (Step->getType()->isFloatingPointTy()) {
9746       Value *StartIdx =
9747           getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part);
9748       // Floating-point operations inherit FMF via the builder's flags.
9749       Value *MulOp = State.Builder.CreateFMul(StartIdx, Step);
9750       EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(),
9751                                             ScalarIV, MulOp);
9752     } else {
9753       Value *StartIdx =
9754           getRuntimeVF(State.Builder, Step->getType(), State.VF * Part);
9755       EntryPart = State.Builder.CreateAdd(
9756           ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction");
9757     }
9758     State.set(this, EntryPart, Part);
9759   }
9760 }
9761 
9762 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9763   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
9764                                  State);
9765 }
9766 
9767 void VPBlendRecipe::execute(VPTransformState &State) {
9768   State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9769   // We know that all PHIs in non-header blocks are converted into
9770   // selects, so we don't have to worry about the insertion order and we
9771   // can just use the builder.
9772   // At this point we generate the predication tree. There may be
9773   // duplications since this is a simple recursive scan, but future
9774   // optimizations will clean it up.
9775 
9776   unsigned NumIncoming = getNumIncomingValues();
9777 
9778   // Generate a sequence of selects of the form:
9779   // SELECT(Mask3, In3,
9780   //        SELECT(Mask2, In2,
9781   //               SELECT(Mask1, In1,
9782   //                      In0)))
9783   // Note that Mask0 is never used: lanes for which no path reaches this phi,
9784   // and which are therefore essentially undef, are taken from In0.
9785   InnerLoopVectorizer::VectorParts Entry(State.UF);
9786   for (unsigned In = 0; In < NumIncoming; ++In) {
9787     for (unsigned Part = 0; Part < State.UF; ++Part) {
9788       // We might have single edge PHIs (blocks) - use an identity
9789       // 'select' for the first PHI operand.
9790       Value *In0 = State.get(getIncomingValue(In), Part);
9791       if (In == 0)
9792         Entry[Part] = In0; // Initialize with the first incoming value.
9793       else {
9794         // Select between the current value and the previous incoming edge
9795         // based on the incoming mask.
9796         Value *Cond = State.get(getMask(In), Part);
9797         Entry[Part] =
9798             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9799       }
9800     }
9801   }
9802   for (unsigned Part = 0; Part < State.UF; ++Part)
9803     State.set(this, Entry[Part], Part);
9804 }
9805 
9806 void VPInterleaveRecipe::execute(VPTransformState &State) {
9807   assert(!State.Instance && "Interleave group being replicated.");
9808   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9809                                       getStoredValues(), getMask());
9810 }
9811 
9812 void VPReductionRecipe::execute(VPTransformState &State) {
9813   assert(!State.Instance && "Reduction being replicated.");
9814   Value *PrevInChain = State.get(getChainOp(), 0);
9815   RecurKind Kind = RdxDesc->getRecurrenceKind();
9816   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9817   // Propagate the fast-math flags carried by the underlying instruction.
9818   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9819   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9820   for (unsigned Part = 0; Part < State.UF; ++Part) {
9821     Value *NewVecOp = State.get(getVecOp(), Part);
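    // For conditional (masked) reductions, inactive lanes are replaced by the
    // recurrence identity (e.g. 0 for add, 1 for mul) so they do not change
    // the reduced value.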
9822     if (VPValue *Cond = getCondOp()) {
9823       Value *NewCond = State.get(Cond, Part);
9824       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9825       Value *Iden = RdxDesc->getRecurrenceIdentity(
9826           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9827       Value *IdenVec =
9828           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9829       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9830       NewVecOp = Select;
9831     }
9832     Value *NewRed;
9833     Value *NextInChain;
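    // Ordered (strict FP) reductions fold the whole vector into the running
    // scalar chain value; unordered reductions reduce the vector first and
    // combine the result with the chain afterwards.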
9834     if (IsOrdered) {
9835       if (State.VF.isVector())
9836         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9837                                         PrevInChain);
9838       else
9839         NewRed = State.Builder.CreateBinOp(
9840             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9841             NewVecOp);
9842       PrevInChain = NewRed;
9843     } else {
9844       PrevInChain = State.get(getChainOp(), Part);
9845       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9846     }
9847     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9848       NextInChain =
9849           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9850                          NewRed, PrevInChain);
9851     } else if (IsOrdered)
9852       NextInChain = NewRed;
9853     else
9854       NextInChain = State.Builder.CreateBinOp(
9855           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9856           PrevInChain);
9857     State.set(this, NextInChain, Part);
9858   }
9859 }
9860 
9861 void VPReplicateRecipe::execute(VPTransformState &State) {
9862   if (State.Instance) { // Generate a single instance.
9863     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9864     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9865                                     IsPredicated, State);
9866     // Insert scalar instance packing it into a vector.
9867     if (AlsoPack && State.VF.isVector()) {
9868       // If we're constructing lane 0, initialize to start from poison.
9869       if (State.Instance->Lane.isFirstLane()) {
9870         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9871         Value *Poison = PoisonValue::get(
9872             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9873         State.set(this, Poison, State.Instance->Part);
9874       }
9875       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9876     }
9877     return;
9878   }
9879 
9880   // Generate scalar instances for all VF lanes of all UF parts, unless the
9881   // instruction is uniform, in which case generate only the first lane for
9882   // each of the UF parts.
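  // E.g. UF=2 and VF=4 produce 8 scalar clones for a non-uniform recipe, but
  // only 2 (one per part) for a uniform one.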
9883   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9884   assert((!State.VF.isScalable() || IsUniform) &&
9885          "Can't scalarize a scalable vector");
9886   for (unsigned Part = 0; Part < State.UF; ++Part)
9887     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9888       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9889                                       VPIteration(Part, Lane), IsPredicated,
9890                                       State);
9891 }
9892 
9893 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9894   assert(State.Instance && "Branch on Mask works only on single instance.");
9895 
9896   unsigned Part = State.Instance->Part;
9897   unsigned Lane = State.Instance->Lane.getKnownLane();
9898 
9899   Value *ConditionBit = nullptr;
9900   VPValue *BlockInMask = getMask();
9901   if (BlockInMask) {
9902     ConditionBit = State.get(BlockInMask, Part);
9903     if (ConditionBit->getType()->isVectorTy())
9904       ConditionBit = State.Builder.CreateExtractElement(
9905           ConditionBit, State.Builder.getInt32(Lane));
9906   } else // Block in mask is all-one.
9907     ConditionBit = State.Builder.getTrue();
9908 
9909   // Replace the temporary unreachable terminator with a new conditional branch,
9910   // whose two destinations will be set later when they are created.
9911   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9912   assert(isa<UnreachableInst>(CurrentTerminator) &&
9913          "Expected to replace unreachable terminator with conditional branch.");
9914   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9915   CondBr->setSuccessor(0, nullptr);
9916   ReplaceInstWithInst(CurrentTerminator, CondBr);
9917 }
9918 
9919 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9920   assert(State.Instance && "Predicated instruction PHI works per instance.");
9921   Instruction *ScalarPredInst =
9922       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9923   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9924   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9925   assert(PredicatingBB && "Predicated block has no single predecessor.");
9926   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9927          "operand must be VPReplicateRecipe");
9928 
9929   // By current pack/unpack logic we need to generate only a single phi node: if
9930   // a vector value for the predicated instruction exists at this point it means
9931   // the instruction has vector users only, and a phi for the vector value is
9932   // needed. In this case the recipe of the predicated instruction is marked to
9933   // also do that packing, thereby "hoisting" the insert-element sequence.
9934   // Otherwise, a phi node for the scalar value is needed.
9935   unsigned Part = State.Instance->Part;
9936   if (State.hasVectorValue(getOperand(0), Part)) {
9937     Value *VectorValue = State.get(getOperand(0), Part);
9938     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9939     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9940     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9941     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9942     if (State.hasVectorValue(this, Part))
9943       State.reset(this, VPhi, Part);
9944     else
9945       State.set(this, VPhi, Part);
9946     // NOTE: Currently we need to update the value of the operand, so the next
9947     // predicated iteration inserts its generated value in the correct vector.
9948     State.reset(getOperand(0), VPhi, Part);
9949   } else {
9950     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9951     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9952     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9953                      PredicatingBB);
9954     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9955     if (State.hasScalarValue(this, *State.Instance))
9956       State.reset(this, Phi, *State.Instance);
9957     else
9958       State.set(this, Phi, *State.Instance);
9959     // NOTE: Currently we need to update the value of the operand, so the next
9960     // predicated iteration inserts its generated value in the correct vector.
9961     State.reset(getOperand(0), Phi, *State.Instance);
9962   }
9963 }
9964 
9965 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9966   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9967 
9968   // Attempt to issue a wide load.
9969   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9970   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9971 
9972   assert((LI || SI) && "Invalid Load/Store instruction");
9973   assert((!SI || StoredValue) && "No stored value provided for widened store");
9974   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9975 
9976   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9977 
9978   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9979   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9980   bool CreateGatherScatter = !Consecutive;
9981 
9982   auto &Builder = State.Builder;
9983   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9984   bool isMaskRequired = getMask();
9985   if (isMaskRequired)
9986     for (unsigned Part = 0; Part < State.UF; ++Part)
9987       BlockInMaskParts[Part] = State.get(getMask(), Part);
9988 
9989   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9990     // Calculate the pointer for the specific unroll-part.
9991     GetElementPtrInst *PartPtr = nullptr;
9992 
9993     bool InBounds = false;
9994     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9995       InBounds = gep->isInBounds();
9996     if (Reverse) {
9997       // If the address is consecutive but reversed, then the
9998       // wide store needs to start at the last vector element.
9999       // RunTimeVF = VScale * VF.getKnownMinValue(); for fixed-width vectors
10000       // VScale is 1, so RunTimeVF = VF.getKnownMinValue().
10001       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
10002       // NumElt = -Part * RunTimeVF
10003       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
10004       // LastLane = 1 - RunTimeVF
10005       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
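      // E.g. for a fixed VF of 4: part 0 starts at Ptr - 3 (elements
      // Ptr[-3..0]) and part 1 at Ptr - 7 (elements Ptr[-7..-4]); the data
      // itself is reversed separately in the load/store handling below.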
10006       PartPtr =
10007           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
10008       PartPtr->setIsInBounds(InBounds);
10009       PartPtr = cast<GetElementPtrInst>(
10010           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
10011       PartPtr->setIsInBounds(InBounds);
10012       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
10013         BlockInMaskParts[Part] =
10014             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
10015     } else {
10016       Value *Increment =
10017           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
10018       PartPtr = cast<GetElementPtrInst>(
10019           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
10020       PartPtr->setIsInBounds(InBounds);
10021     }
10022 
10023     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
10024     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
10025   };
10026 
10027   // Handle Stores:
10028   if (SI) {
10029     State.ILV->setDebugLocFromInst(SI);
10030 
10031     for (unsigned Part = 0; Part < State.UF; ++Part) {
10032       Instruction *NewSI = nullptr;
10033       Value *StoredVal = State.get(StoredValue, Part);
10034       if (CreateGatherScatter) {
10035         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
10036         Value *VectorGep = State.get(getAddr(), Part);
10037         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
10038                                             MaskPart);
10039       } else {
10040         if (Reverse) {
10041           // If we store to reverse consecutive memory locations, then we need
10042           // to reverse the order of elements in the stored value.
10043           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
10044           // We don't want to update the value in the map as it might be used in
10045           // another expression. So don't call resetVectorValue(StoredVal).
10046         }
10047         auto *VecPtr =
10048             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10049         if (isMaskRequired)
10050           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
10051                                             BlockInMaskParts[Part]);
10052         else
10053           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
10054       }
10055       State.ILV->addMetadata(NewSI, SI);
10056     }
10057     return;
10058   }
10059 
10060   // Handle loads.
10061   assert(LI && "Must have a load instruction");
10062   State.ILV->setDebugLocFromInst(LI);
10063   for (unsigned Part = 0; Part < State.UF; ++Part) {
10064     Value *NewLI;
10065     if (CreateGatherScatter) {
10066       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
10067       Value *VectorGep = State.get(getAddr(), Part);
10068       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
10069                                          nullptr, "wide.masked.gather");
10070       State.ILV->addMetadata(NewLI, LI);
10071     } else {
10072       auto *VecPtr =
10073           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10074       if (isMaskRequired)
10075         NewLI = Builder.CreateMaskedLoad(
10076             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
10077             PoisonValue::get(DataTy), "wide.masked.load");
10078       else
10079         NewLI =
10080             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
10081 
10082       // Add metadata to the load, but record the reversed value in State.
10083       State.ILV->addMetadata(NewLI, LI);
10084       if (Reverse)
10085         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
10086     }
10087 
10088     State.set(this, NewLI, Part);
10089   }
10090 }
10091 
10092 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10093 // for minimum code-size, 2) compiler options that request predication,
10094 // 3) loop hints forcing predication, and 4) a TTI hook that analyses whether
10095 // the loop is suitable for predication.
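// For example, a function carrying the optsize/minsize attribute always takes
// case 1) and returns CM_ScalarEpilogueNotAllowedOptSize, regardless of hints.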
10096 static ScalarEpilogueLowering getScalarEpilogueLowering(
10097     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10098     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10099     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
10100     LoopVectorizationLegality &LVL) {
10101   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10102   // don't look at hints or options, and don't request a scalar epilogue.
10103   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10104   // LoopAccessInfo (due to code dependency and not being able to reliably get
10105   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10106   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10107   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10108   // back to the old way and vectorize with versioning when forced. See D81345.)
10109   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10110                                                       PGSOQueryType::IRPass) &&
10111                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10112     return CM_ScalarEpilogueNotAllowedOptSize;
10113 
10114   // 2) If set, obey the directives
10115   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10116     switch (PreferPredicateOverEpilogue) {
10117     case PreferPredicateTy::ScalarEpilogue:
10118       return CM_ScalarEpilogueAllowed;
10119     case PreferPredicateTy::PredicateElseScalarEpilogue:
10120       return CM_ScalarEpilogueNotNeededUsePredicate;
10121     case PreferPredicateTy::PredicateOrDontVectorize:
10122       return CM_ScalarEpilogueNotAllowedUsePredicate;
10123     };
10124   }
10125 
10126   // 3) If set, obey the hints
10127   switch (Hints.getPredicate()) {
10128   case LoopVectorizeHints::FK_Enabled:
10129     return CM_ScalarEpilogueNotNeededUsePredicate;
10130   case LoopVectorizeHints::FK_Disabled:
10131     return CM_ScalarEpilogueAllowed;
10132   };
10133 
10134   // 4) if the TTI hook indicates this is profitable, request predication.
10135   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10136                                        LVL.getLAI()))
10137     return CM_ScalarEpilogueNotNeededUsePredicate;
10138 
10139   return CM_ScalarEpilogueAllowed;
10140 }
10141 
10142 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
10143   // If Values have been set for this Def, return the one for \p Part.
10144   if (hasVectorValue(Def, Part))
10145     return Data.PerPartOutput[Def][Part];
10146 
10147   if (!hasScalarValue(Def, {Part, 0})) {
10148     Value *IRV = Def->getLiveInIRValue();
10149     Value *B = ILV->getBroadcastInstrs(IRV);
10150     set(Def, B, Part);
10151     return B;
10152   }
10153 
10154   Value *ScalarValue = get(Def, {Part, 0});
10155   // If we aren't vectorizing, we can just copy the scalar map values over
10156   // to the vector map.
10157   if (VF.isScalar()) {
10158     set(Def, ScalarValue, Part);
10159     return ScalarValue;
10160   }
10161 
10162   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
10163   bool IsUniform = RepR && RepR->isUniform();
10164 
10165   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
10166   // Check if there is a scalar value for the selected lane.
10167   if (!hasScalarValue(Def, {Part, LastLane})) {
10168     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
10169     assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
10170             isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
10171            "unexpected recipe found to be invariant");
10172     IsUniform = true;
10173     LastLane = 0;
10174   }
10175 
10176   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10177   // Set the insert point after the last scalarized instruction or after the
10178   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10179   // will directly follow the scalar definitions.
10180   auto OldIP = Builder.saveIP();
10181   auto NewIP =
10182       isa<PHINode>(LastInst)
10183           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10184           : std::next(BasicBlock::iterator(LastInst));
10185   Builder.SetInsertPoint(&*NewIP);
10186 
10187   // However, if we are vectorizing, we need to construct the vector values.
10188   // If the value is known to be uniform after vectorization, we can just
10189   // broadcast the scalar value corresponding to lane zero for each unroll
10190   // iteration. Otherwise, we construct the vector values using
10191   // insertelement instructions. Since the resulting vectors are stored in
10192   // State, we will only generate the insertelements once.
10193   Value *VectorValue = nullptr;
10194   if (IsUniform) {
10195     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10196     set(Def, VectorValue, Part);
10197   } else {
10198     // Initialize packing with insertelements to start from poison.
10199     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
10200     Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
10201     set(Def, Poison, Part);
10202     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10203       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10204     VectorValue = get(Def, Part);
10205   }
10206   Builder.restoreIP(OldIP);
10207   return VectorValue;
10208 }
10209 
10210 // Process the loop in the VPlan-native vectorization path. This path builds
10211 // VPlan upfront in the vectorization pipeline, which allows applying
10212 // VPlan-to-VPlan transformations from the very beginning without modifying the
10213 // input LLVM IR.
10214 static bool processLoopInVPlanNativePath(
10215     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10216     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10217     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10218     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10219     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10220     LoopVectorizationRequirements &Requirements) {
10221 
10222   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10223     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10224     return false;
10225   }
10226   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10227   Function *F = L->getHeader()->getParent();
10228   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10229 
10230   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10231       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10232 
10233   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10234                                 &Hints, IAI);
10235   // Use the planner for outer loop vectorization.
10236   // TODO: CM is not used at this point inside the planner. Turn CM into an
10237   // optional argument if we don't need it in the future.
10238   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10239                                Requirements, ORE);
10240 
10241   // Get user vectorization factor.
10242   ElementCount UserVF = Hints.getWidth();
10243 
10244   CM.collectElementTypesForWidening();
10245 
10246   // Plan how to best vectorize, return the best VF and its cost.
10247   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10248 
10249   // If we are stress testing VPlan builds, do not attempt to generate vector
10250   // code. Masked vector code generation support will follow soon.
10251   // Also, do not attempt to vectorize if no vector code will be produced.
10252   if (VPlanBuildStressTest || EnableVPlanPredication ||
10253       VectorizationFactor::Disabled() == VF)
10254     return false;
10255 
10256   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10257 
10258   {
10259     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10260                              F->getParent()->getDataLayout());
10261     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10262                            &CM, BFI, PSI, Checks);
10263     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10264                       << L->getHeader()->getParent()->getName() << "\"\n");
10265     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10266   }
10267 
10268   // Mark the loop as already vectorized to avoid vectorizing again.
10269   Hints.setAlreadyVectorized();
10270   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10271   return true;
10272 }
10273 
10274 // Emit a remark if there are stores to floats that required a floating point
10275 // extension. If the vectorized loop was generated with a wider floating point
10276 // type, there will be a performance penalty from the conversion overhead and
10277 // the change in the vector width.
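// For instance (illustrative C source): with 'float *A, *B;' the statement
// 'A[i] = B[i] * 2.0;' performs the multiply in double, so the chain feeding
// the float store contains an fpext that this remark will point at.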
10278 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10279   SmallVector<Instruction *, 4> Worklist;
10280   for (BasicBlock *BB : L->getBlocks()) {
10281     for (Instruction &Inst : *BB) {
10282       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10283         if (S->getValueOperand()->getType()->isFloatTy())
10284           Worklist.push_back(S);
10285       }
10286     }
10287   }
10288 
10289   // Traverse the floating point stores upwards, searching for floating point
10290   // conversions.
10291   SmallPtrSet<const Instruction *, 4> Visited;
10292   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10293   while (!Worklist.empty()) {
10294     auto *I = Worklist.pop_back_val();
10295     if (!L->contains(I))
10296       continue;
10297     if (!Visited.insert(I).second)
10298       continue;
10299 
10300     // Emit a remark if the floating point store required a floating
10301     // point conversion.
10302     // TODO: More work could be done to identify the root cause such as a
10303     // constant or a function return type and point the user to it.
10304     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10305       ORE->emit([&]() {
10306         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10307                                           I->getDebugLoc(), L->getHeader())
10308                << "floating point conversion changes vector width. "
10309                << "Mixed floating point precision requires an up/down "
10310                << "cast that will negatively impact performance.";
10311       });
10312 
10313     for (Use &Op : I->operands())
10314       if (auto *OpI = dyn_cast<Instruction>(Op))
10315         Worklist.push_back(OpI);
10316   }
10317 }
10318 
10319 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10320     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10321                                !EnableLoopInterleaving),
10322       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10323                               !EnableLoopVectorization) {}
10324 
10325 bool LoopVectorizePass::processLoop(Loop *L) {
10326   assert((EnableVPlanNativePath || L->isInnermost()) &&
10327          "VPlan-native path is not enabled. Only process inner loops.");
10328 
10329 #ifndef NDEBUG
10330   const std::string DebugLocStr = getDebugLocString(L);
10331 #endif /* NDEBUG */
10332 
10333   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10334                     << L->getHeader()->getParent()->getName() << "' from "
10335                     << DebugLocStr << "\n");
10336 
10337   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10338 
10339   LLVM_DEBUG(
10340       dbgs() << "LV: Loop hints:"
10341              << " force="
10342              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10343                      ? "disabled"
10344                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10345                             ? "enabled"
10346                             : "?"))
10347              << " width=" << Hints.getWidth()
10348              << " interleave=" << Hints.getInterleave() << "\n");
10349 
10350   // Function containing loop
10351   Function *F = L->getHeader()->getParent();
10352 
10353   // Looking at the diagnostic output is the only way to determine if a loop
10354   // was vectorized (other than looking at the IR or machine code), so it
10355   // is important to generate an optimization remark for each loop. Most of
10356   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10357   // generated as OptimizationRemark and OptimizationRemarkMissed are
10358   // less verbose, reporting vectorized loops and unvectorized loops that may
10359   // benefit from vectorization, respectively.
10360 
10361   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10362     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10363     return false;
10364   }
10365 
10366   PredicatedScalarEvolution PSE(*SE, *L);
10367 
10368   // Check if it is legal to vectorize the loop.
10369   LoopVectorizationRequirements Requirements;
10370   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10371                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10372   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10373     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10374     Hints.emitRemarkWithHints();
10375     return false;
10376   }
10377 
10378   // Check the function attributes and profiles to find out if this function
10379   // should be optimized for size.
10380   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10381       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10382 
10383   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10384   // here. They may require CFG and instruction level transformations before
10385   // even evaluating whether vectorization is profitable. Since we cannot modify
10386   // the incoming IR, we need to build VPlan upfront in the vectorization
10387   // pipeline.
10388   if (!L->isInnermost())
10389     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10390                                         ORE, BFI, PSI, Hints, Requirements);
10391 
10392   assert(L->isInnermost() && "Inner loop expected.");
10393 
10394   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10395   // count by optimizing for size, to minimize overheads.
10396   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10397   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10398     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10399                       << "This loop is worth vectorizing only if no scalar "
10400                       << "iteration overheads are incurred.");
10401     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10402       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10403     else {
10404       LLVM_DEBUG(dbgs() << "\n");
10405       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10406     }
10407   }
10408 
10409   // Check the function attributes to see if implicit floats are allowed.
10410   // FIXME: This check doesn't seem right -- what if the loop is
10411   // an integer loop and the vector instructions selected are purely integer
10412   // vector instructions?
10413   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10414     reportVectorizationFailure(
10415         "Can't vectorize when the NoImplicitFloat attribute is used",
10416         "loop not vectorized due to NoImplicitFloat attribute",
10417         "NoImplicitFloat", ORE, L);
10418     Hints.emitRemarkWithHints();
10419     return false;
10420   }
10421 
10422   // Check if the target supports potentially unsafe FP vectorization.
10423   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10424   // for the target we're vectorizing for, to make sure none of the
10425   // additional fp-math flags can help.
10426   if (Hints.isPotentiallyUnsafe() &&
10427       TTI->isFPVectorizationPotentiallyUnsafe()) {
10428     reportVectorizationFailure(
10429         "Potentially unsafe FP op prevents vectorization",
10430         "loop not vectorized due to unsafe FP support.",
10431         "UnsafeFP", ORE, L);
10432     Hints.emitRemarkWithHints();
10433     return false;
10434   }
10435 
10436   bool AllowOrderedReductions;
10437   // If the flag is set, use that instead and override the TTI behaviour.
10438   if (ForceOrderedReductions.getNumOccurrences() > 0)
10439     AllowOrderedReductions = ForceOrderedReductions;
10440   else
10441     AllowOrderedReductions = TTI->enableOrderedReductions();
10442   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10443     ORE->emit([&]() {
10444       auto *ExactFPMathInst = Requirements.getExactFPInst();
10445       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10446                                                  ExactFPMathInst->getDebugLoc(),
10447                                                  ExactFPMathInst->getParent())
10448              << "loop not vectorized: cannot prove it is safe to reorder "
10449                 "floating-point operations";
10450     });
10451     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10452                          "reorder floating-point operations\n");
10453     Hints.emitRemarkWithHints();
10454     return false;
10455   }
10456 
10457   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10458   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10459 
10460   // If an override option has been passed in for interleaved accesses, use it.
10461   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10462     UseInterleaved = EnableInterleavedMemAccesses;
10463 
10464   // Analyze interleaved memory accesses.
10465   if (UseInterleaved) {
10466     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10467   }
10468 
10469   // Use the cost model.
10470   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10471                                 F, &Hints, IAI);
10472   CM.collectValuesToIgnore();
10473   CM.collectElementTypesForWidening();
10474 
10475   // Use the planner for vectorization.
10476   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10477                                Requirements, ORE);
10478 
10479   // Get user vectorization factor and interleave count.
10480   ElementCount UserVF = Hints.getWidth();
10481   unsigned UserIC = Hints.getInterleave();
10482 
10483   // Plan how to best vectorize, return the best VF and its cost.
10484   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10485 
10486   VectorizationFactor VF = VectorizationFactor::Disabled();
10487   unsigned IC = 1;
10488 
10489   if (MaybeVF) {
10490     VF = *MaybeVF;
10491     // Select the interleave count.
10492     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10493   }
10494 
10495   // Identify the diagnostic messages that should be produced.
10496   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10497   bool VectorizeLoop = true, InterleaveLoop = true;
10498   if (VF.Width.isScalar()) {
10499     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10500     VecDiagMsg = std::make_pair(
10501         "VectorizationNotBeneficial",
10502         "the cost-model indicates that vectorization is not beneficial");
10503     VectorizeLoop = false;
10504   }
10505 
10506   if (!MaybeVF && UserIC > 1) {
10507     // Tell the user interleaving was avoided up-front, despite being explicitly
10508     // requested.
10509     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10510                          "interleaving should be avoided up front\n");
10511     IntDiagMsg = std::make_pair(
10512         "InterleavingAvoided",
10513         "Ignoring UserIC, because interleaving was avoided up front");
10514     InterleaveLoop = false;
10515   } else if (IC == 1 && UserIC <= 1) {
10516     // Tell the user interleaving is not beneficial.
10517     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10518     IntDiagMsg = std::make_pair(
10519         "InterleavingNotBeneficial",
10520         "the cost-model indicates that interleaving is not beneficial");
10521     InterleaveLoop = false;
10522     if (UserIC == 1) {
10523       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10524       IntDiagMsg.second +=
10525           " and is explicitly disabled or interleave count is set to 1";
10526     }
10527   } else if (IC > 1 && UserIC == 1) {
10528     // Tell the user interleaving is beneficial, but is explicitly disabled.
10529     LLVM_DEBUG(
10530         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10531     IntDiagMsg = std::make_pair(
10532         "InterleavingBeneficialButDisabled",
10533         "the cost-model indicates that interleaving is beneficial "
10534         "but is explicitly disabled or interleave count is set to 1");
10535     InterleaveLoop = false;
10536   }
10537 
10538   // Override IC if user provided an interleave count.
10539   IC = UserIC > 0 ? UserIC : IC;
10540 
10541   // Emit diagnostic messages, if any.
10542   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10543   if (!VectorizeLoop && !InterleaveLoop) {
10544     // Do not vectorize or interleave the loop.
10545     ORE->emit([&]() {
10546       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10547                                       L->getStartLoc(), L->getHeader())
10548              << VecDiagMsg.second;
10549     });
10550     ORE->emit([&]() {
10551       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10552                                       L->getStartLoc(), L->getHeader())
10553              << IntDiagMsg.second;
10554     });
10555     return false;
10556   } else if (!VectorizeLoop && InterleaveLoop) {
10557     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10558     ORE->emit([&]() {
10559       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10560                                         L->getStartLoc(), L->getHeader())
10561              << VecDiagMsg.second;
10562     });
10563   } else if (VectorizeLoop && !InterleaveLoop) {
10564     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10565                       << ") in " << DebugLocStr << '\n');
10566     ORE->emit([&]() {
10567       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10568                                         L->getStartLoc(), L->getHeader())
10569              << IntDiagMsg.second;
10570     });
10571   } else if (VectorizeLoop && InterleaveLoop) {
10572     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10573                       << ") in " << DebugLocStr << '\n');
10574     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10575   }
10576 
10577   bool DisableRuntimeUnroll = false;
10578   MDNode *OrigLoopID = L->getLoopID();
10579   {
10580     // Optimistically generate runtime checks. Drop them if they turn out to not
10581     // be profitable. Limit the scope of Checks, so the cleanup happens
10582     // immediately after vector code generation is done.
10583     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10584                              F->getParent()->getDataLayout());
10585     if (!VF.Width.isScalar() || IC > 1)
10586       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, IC);
10587 
10588     using namespace ore;
10589     if (!VectorizeLoop) {
10590       assert(IC > 1 && "interleave count should not be 1 or 0");
10591       // If we decided that it is not legal to vectorize the loop, then
10592       // interleave it.
10593       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10594                                  &CM, BFI, PSI, Checks);
10595 
10596       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10597       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10598 
10599       ORE->emit([&]() {
10600         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10601                                   L->getHeader())
10602                << "interleaved loop (interleaved count: "
10603                << NV("InterleaveCount", IC) << ")";
10604       });
10605     } else {
10606       // If we decided that it is *legal* to vectorize the loop, then do it.
10607 
10608       // Consider vectorizing the epilogue too if it's profitable.
10609       VectorizationFactor EpilogueVF =
10610           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10611       if (EpilogueVF.Width.isVector()) {
10612 
10613         // The first pass vectorizes the main loop and creates a scalar epilogue
10614         // to be vectorized by executing the plan (potentially with a different
10615         // factor) again shortly afterwards.
10616         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10617         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10618                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10619 
10620         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10621         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10622                         DT);
10623         ++LoopsVectorized;
10624 
10625         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10626         formLCSSARecursively(*L, *DT, LI, SE);
10627 
10628         // Second pass vectorizes the epilogue and adjusts the control flow
10629         // edges from the first pass.
10630         EPI.MainLoopVF = EPI.EpilogueVF;
10631         EPI.MainLoopUF = EPI.EpilogueUF;
10632         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10633                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10634                                                  Checks);
10635 
10636         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10637         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10638         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10639         Header->setName("vec.epilog.vector.body");
10640 
10641         // Ensure that the start values for any VPReductionPHIRecipes are
10642         // updated before vectorising the epilogue loop.
10643         for (VPRecipeBase &R : Header->phis()) {
10644           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10645             if (auto *Resume = MainILV.getReductionResumeValue(
10646                     ReductionPhi->getRecurrenceDescriptor())) {
10647               VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume);
10648               ReductionPhi->setOperand(0, StartVal);
10649             }
10650           }
10651         }
10652 
10653         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10654                         DT);
10655         ++LoopsEpilogueVectorized;
10656 
10657         if (!MainILV.areSafetyChecksAdded())
10658           DisableRuntimeUnroll = true;
10659       } else {
10660         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10661                                &LVL, &CM, BFI, PSI, Checks);
10662 
10663         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10664         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
10665         ++LoopsVectorized;
10666 
10667         // Add metadata to disable runtime unrolling a scalar loop when there
10668         // are no runtime checks about strides and memory. A scalar loop that is
10669         // rarely used is not worth unrolling.
10670         if (!LB.areSafetyChecksAdded())
10671           DisableRuntimeUnroll = true;
10672       }
10673       // Report the vectorization decision.
10674       ORE->emit([&]() {
10675         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10676                                   L->getHeader())
10677                << "vectorized loop (vectorization width: "
10678                << NV("VectorizationFactor", VF.Width)
10679                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10680       });
10681     }
10682 
10683     if (ORE->allowExtraAnalysis(LV_NAME))
10684       checkMixedPrecision(L, ORE);
10685   }
10686 
10687   Optional<MDNode *> RemainderLoopID =
10688       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10689                                       LLVMLoopVectorizeFollowupEpilogue});
10690   if (RemainderLoopID.hasValue()) {
10691     L->setLoopID(RemainderLoopID.getValue());
10692   } else {
10693     if (DisableRuntimeUnroll)
10694       AddRuntimeUnrollDisableMetaData(L);
10695 
10696     // Mark the loop as already vectorized to avoid vectorizing again.
10697     Hints.setAlreadyVectorized();
10698   }
10699 
10700   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10701   return true;
10702 }
10703 
10704 LoopVectorizeResult LoopVectorizePass::runImpl(
10705     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10706     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10707     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10708     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10709     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10710   SE = &SE_;
10711   LI = &LI_;
10712   TTI = &TTI_;
10713   DT = &DT_;
10714   BFI = &BFI_;
10715   TLI = TLI_;
10716   AA = &AA_;
10717   AC = &AC_;
10718   GetLAA = &GetLAA_;
10719   DB = &DB_;
10720   ORE = &ORE_;
10721   PSI = PSI_;
10722 
10723   // Don't attempt if
10724   // 1. the target claims to have no vector registers, and
10725   // 2. interleaving won't help ILP.
10726   //
10727   // The second condition is necessary because, even if the target has no
10728   // vector registers, loop vectorization may still enable scalar
10729   // interleaving.
10730   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10731       TTI->getMaxInterleaveFactor(1) < 2)
10732     return LoopVectorizeResult(false, false);
10733 
10734   bool Changed = false, CFGChanged = false;
10735 
10736   // The vectorizer requires loops to be in simplified form.
10737   // Since simplification may add new inner loops, it has to run before the
10738   // legality and profitability checks. This means running the loop vectorizer
10739   // will simplify all loops, regardless of whether anything ends up being
10740   // vectorized.
10741   for (auto &L : *LI)
10742     Changed |= CFGChanged |=
10743         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10744 
10745   // Build up a worklist of inner-loops to vectorize. This is necessary as
10746   // the act of vectorizing or partially unrolling a loop creates new loops
10747   // and can invalidate iterators across the loops.
10748   SmallVector<Loop *, 8> Worklist;
10749 
10750   for (Loop *L : *LI)
10751     collectSupportedLoops(*L, LI, ORE, Worklist);
10752 
10753   LoopsAnalyzed += Worklist.size();
10754 
10755   // Now walk the identified inner loops.
10756   while (!Worklist.empty()) {
10757     Loop *L = Worklist.pop_back_val();
10758 
10759     // For the inner loops we actually process, form LCSSA to simplify the
10760     // transform.
10761     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10762 
10763     Changed |= CFGChanged |= processLoop(L);
10764   }
10765 
10766   // Process each loop nest in the function.
10767   return LoopVectorizeResult(Changed, CFGChanged);
10768 }
10769 
10770 PreservedAnalyses LoopVectorizePass::run(Function &F,
10771                                          FunctionAnalysisManager &AM) {
10772   auto &LI = AM.getResult<LoopAnalysis>(F);
10773   // There are no loops in the function. Return before computing other
10774   // expensive analyses.
10775   if (LI.empty())
10776     return PreservedAnalyses::all();
10777   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10778   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10779   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10780   auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10781   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10782   auto &AA = AM.getResult<AAManager>(F);
10783   auto &AC = AM.getResult<AssumptionAnalysis>(F);
10784   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10785   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10786 
10787   auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10788   std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10789       [&](Loop &L) -> const LoopAccessInfo & {
10790     LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
10791                                       TLI, TTI, nullptr, nullptr, nullptr};
10792     return LAM.getResult<LoopAccessAnalysis>(L, AR);
10793   };
10794   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10795   ProfileSummaryInfo *PSI =
10796       MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10797   LoopVectorizeResult Result =
10798       runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10799   if (!Result.MadeAnyChange)
10800     return PreservedAnalyses::all();
10801   PreservedAnalyses PA;
10802 
10803   // We currently do not preserve loopinfo/dominator analyses with outer loop
10804   // vectorization. Until this is addressed, mark these analyses as preserved
10805   // only for non-VPlan-native path.
10806   // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10807   if (!EnableVPlanNativePath) {
10808     PA.preserve<LoopAnalysis>();
10809     PA.preserve<DominatorTreeAnalysis>();
10810   }
10811 
10812   if (Result.MadeCFGChange) {
10813     // Making CFG changes likely means a loop got vectorized. Indicate that
10814     // extra simplification passes should be run.
10815     // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10816     // be run if runtime checks have been added.
10817     AM.getResult<ShouldRunExtraVectorPasses>(F);
10818     PA.preserve<ShouldRunExtraVectorPasses>();
10819   } else {
10820     PA.preserveSet<CFGAnalyses>();
10821   }
10822   return PA;
10823 }
10824 
10825 void LoopVectorizePass::printPipeline(
10826     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10827   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10828       OS, MapClassName2PassName);
10829 
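  // Example (approximate) output when neither option is forced:
  //   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>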
10830   OS << "<";
10831   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10832   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10833   OS << ">";
10834 }
10835