1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/Metadata.h"
116 #include "llvm/IR/Module.h"
117 #include "llvm/IR/Operator.h"
118 #include "llvm/IR/PatternMatch.h"
119 #include "llvm/IR/Type.h"
120 #include "llvm/IR/Use.h"
121 #include "llvm/IR/User.h"
122 #include "llvm/IR/Value.h"
123 #include "llvm/IR/ValueHandle.h"
124 #include "llvm/IR/Verifier.h"
125 #include "llvm/InitializePasses.h"
126 #include "llvm/Pass.h"
127 #include "llvm/Support/Casting.h"
128 #include "llvm/Support/CommandLine.h"
129 #include "llvm/Support/Compiler.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <functional>
147 #include <iterator>
148 #include <limits>
149 #include <map>
150 #include <memory>
151 #include <string>
152 #include <tuple>
153 #include <utility>
154 
155 using namespace llvm;
156 
157 #define LV_NAME "loop-vectorize"
158 #define DEBUG_TYPE LV_NAME
159 
#ifndef NDEBUG
// Debug-type string for requesting extra-verbose traces via
// -debug-only=loop-vectorize-verbose.
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names. These identify the "followup" loop metadata that
/// directs which attributes are attached to the loops this pass produces.
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

// Pass statistics, reported with -stats.
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176 
// Master switch (default on) for vectorizing an additional "epilogue" loop
// that handles the remainder iterations of the main vector loop.
static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

// Testing knob: any value greater than 1 overrides the VF chosen for
// applicable epilogue loops.
static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

// Minimum main-loop VF for a loop to be considered for epilogue vectorization.
static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

// Cap on the number of runtime memory checks emitted for a loop annotated
// with a vectorize(enable) pragma.
static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                         "scalar-epilogue",
                         "Don't tail-predicate loops, create scalar epilogue"),
              clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                         "predicate-else-scalar-epilogue",
                         "prefer tail-folding, create scalar epilogue if tail "
                         "folding fails."),
              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                         "predicate-dont-vectorize",
                         "prefers tail-folding, don't attempt vectorization if "
                         "tail-folding fails.")));

// When picking the VF, consider the widest VF the smallest element type in
// the loop allows, rather than stopping at the largest type's natural width.
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

// Opt-in for vectorizing interleaved (strided, grouped) memory accesses.
static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
250 
251 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
252     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
253     cl::desc("We don't interleave loops with a estimated constant trip count "
254              "below this number"));
255 
// Testing knobs (0 = use the target's real value) that override what the
// target reports, so the cost model can be exercised deterministically.
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

// Cost threshold under which a loop counts as "small" for the interleaving
// heuristics below.
static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

// Use block-frequency/PGO data to be conservative in cold code and more
// aggressive in hot code.
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

// When interleaving, count the induction variable's registers only once
// rather than once per interleaved copy.
static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

// Allow vectorizing conditional (if-guarded) stores via predication.
static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

// Override the target's preference and compute reductions in-loop instead of
// with an after-loop horizontal reduction.
static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the targets preference."));

// Allow vectorizing loops containing in-order (strict) FP reductions.
static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));
342 
// Externally visible (used outside this file): selects the VPlan-native
// vectorization path, which also supports outer loops.
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

// Declared in the llvm namespace so other passes can toggle them.
cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

// Switch VPlan dumps between plain text and GraphViz dot format.
cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));
376 
377 /// A helper function that returns true if the given type is irregular. The
378 /// type is irregular if its allocated size doesn't equal the store size of an
379 /// element of the corresponding vector type.
380 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
381   // Determine if an array of N elements of type Ty is "bitcast compatible"
382   // with a <N x Ty> vector.
383   // This is only true if there is no padding between the array elements.
384   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
385 }
386 
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() {
  // Assume each predicated block runs on half of the header iterations.
  const unsigned AssumedReciprocalProb = 2;
  return AssumedReciprocalProb;
}
394 
395 /// A helper function that returns an integer or floating-point constant with
396 /// value C.
397 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
398   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
399                            : ConstantFP::get(Ty, C);
400 }
401 
402 /// Returns "best known" trip count for the specified loop \p L as defined by
403 /// the following procedure:
404 ///   1) Returns exact trip count if it is known.
405 ///   2) Returns expected trip count according to profile data if any.
406 ///   3) Returns upper bound estimate if it is known.
407 ///   4) Returns None if all of the above failed.
408 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
409   // Check if exact trip count is known.
410   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
411     return ExpectedTC;
412 
413   // Check if there is an expected trip count available from profile data.
414   if (LoopVectorizeWithBlockFrequency)
415     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
416       return EstimatedTC;
417 
418   // Check if upper bound estimate is known.
419   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
420     return ExpectedTC;
421 
422   return None;
423 }
424 
425 // Forward declare GeneratedRTChecks.
426 class GeneratedRTChecks;
427 
428 namespace llvm {
429 
430 AnalysisKey ShouldRunExtraVectorPasses::Key;
431 
432 /// InnerLoopVectorizer vectorizes loops which contain only one basic
433 /// block to a specified vectorization factor (VF).
434 /// This class performs the widening of scalars into vectors, or multiple
435 /// scalars. This class also implements the following features:
436 /// * It inserts an epilogue loop for handling loops that don't have iteration
437 ///   counts that are known to be a multiple of the vectorization factor.
438 /// * It handles the code generation for reduction variables.
439 /// * Scalarization (implementation using scalars) of un-vectorizable
440 ///   instructions.
441 /// InnerLoopVectorizer does not perform any vectorization-legality
442 /// checks, and relies on the caller to check for the different legality
443 /// aspects. The InnerLoopVectorizer relies on the
444 /// LoopVectorizationLegality class to provide information about the induction
445 /// and reduction variables that were found to a given vectorization factor.
446 class InnerLoopVectorizer {
447 public:
  /// Construct a vectorizer for \p OrigLoop, widening by \p VecWidth lanes and
  /// unrolling \p UnrollFactor times. All analysis/legality/cost objects are
  /// borrowed pointers/references owned by the caller; they must outlive this
  /// object.
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }
465 
466   virtual ~InnerLoopVectorizer() = default;
467 
468   /// Create a new empty loop that will contain vectorized instructions later
469   /// on, while the old loop will be used as the scalar remainder. Control flow
470   /// is generated around the vectorized (and scalar epilogue) loops consisting
471   /// of various checks and bypasses. Return the pre-header block of the new
472   /// loop and the start value for the canonical induction, if it is != 0. The
473   /// latter is the case when vectorizing the epilogue loop. In the case of
474   /// epilogue vectorization, this function is overriden to handle the more
475   /// complex control flow around the loops.
476   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
477 
478   /// Widen a single call instruction within the innermost loop.
479   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
480                             VPTransformState &State);
481 
482   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
483   void fixVectorizedLoop(VPTransformState &State);
484 
485   // Return true if any runtime check is added.
486   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
487 
488   /// A type for vectorized values in the new loop. Each value from the
489   /// original loop, when vectorized, is represented by UF vector values in the
490   /// new unrolled loop, where UF is the unroll factor.
491   using VectorParts = SmallVector<Value *, 2>;
492 
493   /// Vectorize a single vector PHINode in a block in the VPlan-native path
494   /// only.
495   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
496                            VPTransformState &State);
497 
498   /// A helper function to scalarize a single Instruction in the innermost loop.
499   /// Generates a sequence of scalar instances for each lane between \p MinLane
500   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
501   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
502   /// Instr's operands.
503   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
504                             const VPIteration &Instance, bool IfPredicateInstr,
505                             VPTransformState &State);
506 
507   /// Construct the vector value of a scalarized value \p V one lane at a time.
508   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
509                                  VPTransformState &State);
510 
511   /// Try to vectorize interleaved access group \p Group with the base address
512   /// given in \p Addr, optionally masking the vector operations if \p
513   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
514   /// values in the vectorized loop.
515   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
516                                 ArrayRef<VPValue *> VPDefs,
517                                 VPTransformState &State, VPValue *Addr,
518                                 ArrayRef<VPValue *> StoredValues,
519                                 VPValue *BlockInMask = nullptr);
520 
521   /// Set the debug location in the builder \p Ptr using the debug location in
522   /// \p V. If \p Ptr is None then it uses the class member's Builder.
523   void setDebugLocFromInst(const Value *V,
524                            Optional<IRBuilderBase *> CustomBuilder = None);
525 
526   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
527   void fixNonInductionPHIs(VPTransformState &State);
528 
529   /// Returns true if the reordering of FP operations is not allowed, but we are
530   /// able to vectorize with strict in-order reductions for the given RdxDesc.
531   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
532 
533   /// Create a broadcast instruction. This method generates a broadcast
534   /// instruction (shuffle) for loop invariant values and for the induction
535   /// value. If this is the induction variable then we extend it to N, N+1, ...
536   /// this is needed because each iteration in the loop corresponds to a SIMD
537   /// element.
538   virtual Value *getBroadcastInstrs(Value *V);
539 
540   /// Add metadata from one instruction to another.
541   ///
542   /// This includes both the original MDs from \p From and additional ones (\see
543   /// addNewMetadata).  Use this for *newly created* instructions in the vector
544   /// loop.
545   void addMetadata(Instruction *To, Instruction *From);
546 
547   /// Similar to the previous function but it adds the metadata to a
548   /// vector of instructions.
549   void addMetadata(ArrayRef<Value *> To, Instruction *From);
550 
551   // Returns the resume value (bc.merge.rdx) for a reduction as
552   // generated by fixReduction.
553   PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
554 
555 protected:
556   friend class LoopVectorizationPlanner;
557 
558   /// A small list of PHINodes.
559   using PhiVector = SmallVector<PHINode *, 4>;
560 
561   /// A type for scalarized values in the new loop. Each value from the
562   /// original loop, when scalarized, is represented by UF x VF scalar values
563   /// in the new unrolled loop, where UF is the unroll factor and VF is the
564   /// vectorization factor.
565   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
566 
567   /// Set up the values of the IVs correctly when exiting the vector loop.
568   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
569                     Value *CountRoundDown, Value *EndValue,
570                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader);
571 
572   /// Handle all cross-iteration phis in the header.
573   void fixCrossIterationPHIs(VPTransformState &State);
574 
575   /// Create the exit value of first order recurrences in the middle block and
576   /// update their users.
577   void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
578                                VPTransformState &State);
579 
580   /// Create code for the loop exit value of the reduction.
581   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
582 
583   /// Clear NSW/NUW flags from reduction instructions if necessary.
584   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
585                                VPTransformState &State);
586 
587   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
588   /// means we need to add the appropriate incoming value from the middle
589   /// block as exiting edges from the scalar epilogue loop (if present) are
590   /// already in place, and we exit the vector loop exclusively to the middle
591   /// block.
592   void fixLCSSAPHIs(VPTransformState &State);
593 
594   /// Iteratively sink the scalarized operands of a predicated instruction into
595   /// the block that was created for it.
596   void sinkScalarOperands(Instruction *PredInst);
597 
598   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
599   /// represented as.
600   void truncateToMinimalBitwidths(VPTransformState &State);
601 
602   /// Returns (and creates if needed) the original loop trip count.
603   Value *getOrCreateTripCount(BasicBlock *InsertBlock);
604 
605   /// Returns (and creates if needed) the trip count of the widened loop.
606   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
607 
  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitMinimumIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (eg. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);

  /// Add additional metadata to \p To that was not present on \p Orig.
  ///
  /// Currently this is used to add the noalias annotations based on the
  /// inserted memchecks.  Use this for instructions that are *cloned* into the
  /// vector loop.
  void addNewMetadata(Instruction *To, const Instruction *Orig);

  /// Collect poison-generating recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  ///  * Contribute to the address computation of a recipe generating a widen
  ///    memory load/store (VPWidenMemoryInstructionRecipe or
  ///    VPInterleaveRecipe).
  ///  * Such a widen memory load/store has at least one underlying Instruction
  ///    that is in a basic block that needs predication and after vectorization
  ///    the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// LoopVersioning.  It's only set up (non-null) if memchecks were
  /// used.
  ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
  std::unique_ptr<LoopVersioning> LVer;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists.  Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  /// Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  /// Holds the end values for each induction variable. We save the end values
  /// so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// Vector of original scalar PHIs whose corresponding widened PHIs need to be
  /// fixed up at the end of vector code generation.
  SmallVector<PHINode *, 8> OrigPHIsToFix;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  /// Whether this loop should be optimized for size based on profile guided
  /// size optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  /// Holds the resume values for reductions in the loops, used to set the
  /// correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};
779 
/// A specialization of InnerLoopVectorizer that drives the scalar path: it
/// forces a vectorization factor of one (ElementCount::getFixed(1)) and only
/// applies the given unroll (interleave) factor.
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  /// Broadcast behavior specialized for the VF == 1 case; see the
  /// out-of-line definition for details.
  Value *getBroadcastInstrs(Value *V) override;
};
797 
798 /// Encapsulate information regarding vectorization of a loop and its epilogue.
799 /// This information is meant to be updated and used across two stages of
800 /// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  /// VF and UF selected for the main vector loop.
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  /// VF and UF selected for the epilogue vector loop. The constructor asserts
  /// that EpilogueUF is 1.
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  /// Skeleton blocks shared between the two vectorization stages.
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  /// Cached trip counts of the original and widened loop.
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
820 
821 /// An extension of the inner loop vectorizer that creates a skeleton for a
822 /// vectorized loop that has its epilogue (residual) also vectorized.
823 /// The idea is to run the vplan on a given loop twice, firstly to setup the
824 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
825 /// from the first step and vectorize the epilogue.  This is achieved by
826 /// deriving two concrete strategy classes from this base class and invoking
827 /// them in succession from the loop vectorizer planner.
828 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
829 public:
830   InnerLoopAndEpilogueVectorizer(
831       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
832       DominatorTree *DT, const TargetLibraryInfo *TLI,
833       const TargetTransformInfo *TTI, AssumptionCache *AC,
834       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
835       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
836       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
837       GeneratedRTChecks &Checks)
838       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
839                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
840                             Checks),
841         EPI(EPI) {}
842 
843   // Override this function to handle the more complex control flow around the
844   // three loops.
845   std::pair<BasicBlock *, Value *>
846   createVectorizedLoopSkeleton() final override {
847     return createEpilogueVectorizedLoopSkeleton();
848   }
849 
850   /// The interface for creating a vectorized skeleton using one of two
851   /// different strategies, each corresponding to one execution of the vplan
852   /// as described above.
853   virtual std::pair<BasicBlock *, Value *>
854   createEpilogueVectorizedLoopSkeleton() = 0;
855 
856   /// Holds and updates state information required to vectorize the main loop
857   /// and its epilogue in two separate passes. This setup helps us avoid
858   /// regenerating and recomputing runtime safety checks. It also helps us to
859   /// shorten the iteration-count-check path length for the cases where the
860   /// iteration count of the loop is so small that the main vector loop is
861   /// completely skipped.
862   EpilogueLoopVectorizationInfo &EPI;
863 };
864 
865 /// A specialized derived class of inner loop vectorizer that performs
866 /// vectorization of *main* loops in the process of vectorizing loops and their
867 /// epilogues.
868 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
869 public:
870   EpilogueVectorizerMainLoop(
871       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
872       DominatorTree *DT, const TargetLibraryInfo *TLI,
873       const TargetTransformInfo *TTI, AssumptionCache *AC,
874       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
875       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
876       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
877       GeneratedRTChecks &Check)
878       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
879                                        EPI, LVL, CM, BFI, PSI, Check) {}
880   /// Implements the interface for creating a vectorized skeleton using the
881   /// *main loop* strategy (ie the first pass of vplan execution).
882   std::pair<BasicBlock *, Value *>
883   createEpilogueVectorizedLoopSkeleton() final override;
884 
885 protected:
886   /// Emits an iteration count bypass check once for the main loop (when \p
887   /// ForEpilogue is false) and once for the epilogue loop (when \p
888   /// ForEpilogue is true).
889   BasicBlock *emitMinimumIterationCountCheck(BasicBlock *Bypass,
890                                              bool ForEpilogue);
891   void printDebugTracesAtStart() override;
892   void printDebugTracesAtEnd() override;
893 };
894 
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
898 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
899 public:
900   EpilogueVectorizerEpilogueLoop(
901       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
902       DominatorTree *DT, const TargetLibraryInfo *TLI,
903       const TargetTransformInfo *TTI, AssumptionCache *AC,
904       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
905       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
906       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
907       GeneratedRTChecks &Checks)
908       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
909                                        EPI, LVL, CM, BFI, PSI, Checks) {}
910   /// Implements the interface for creating a vectorized skeleton using the
911   /// *epilogue loop* strategy (ie the second pass of vplan execution).
912   std::pair<BasicBlock *, Value *>
913   createEpilogueVectorizedLoopSkeleton() final override;
914 
915 protected:
916   /// Emits an iteration count bypass check after the main vector loop has
917   /// finished to see if there are any iterations left to execute by either
918   /// the vector epilogue or the scalar epilogue.
919   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
920                                                       BasicBlock *Bypass,
921                                                       BasicBlock *Insert);
922   void printDebugTracesAtStart() override;
923   void printDebugTracesAtEnd() override;
924 };
925 } // end namespace llvm
926 
927 /// Look for a meaningful debug location on the instruction or it's
928 /// operands.
929 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
930   if (!I)
931     return I;
932 
933   DebugLoc Empty;
934   if (I->getDebugLoc() != Empty)
935     return I;
936 
937   for (Use &Op : I->operands()) {
938     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
939       if (OpInst->getDebugLoc() != Empty)
940         return OpInst;
941   }
942 
943   return I;
944 }
945 
// Set the current debug location of the chosen builder from \p V: when \p V is
// an instruction with a location, use it (scaling the DILocation's duplication
// factor by UF * VF where applicable); otherwise clear the location.
void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
  // Use the member Builder unless the caller supplied an explicit one.
  IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        // Cloning can fail; keep a trace but fall through without updating.
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    // Not an instruction (or null): reset to an empty debug location.
    B->SetCurrentDebugLocation(DebugLoc());
}
970 
971 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
972 /// is passed, the message relates to that particular instruction.
973 #ifndef NDEBUG
974 static void debugVectorizationMessage(const StringRef Prefix,
975                                       const StringRef DebugMsg,
976                                       Instruction *I) {
977   dbgs() << "LV: " << Prefix << DebugMsg;
978   if (I != nullptr)
979     dbgs() << " " << *I;
980   else
981     dbgs() << '.';
982   dbgs() << '\n';
983 }
984 #endif
985 
986 /// Create an analysis remark that explains why vectorization failed
987 ///
988 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
989 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
990 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
991 /// the location of the remark.  \return the remark object that can be
992 /// streamed to.
993 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
994     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
995   Value *CodeRegion = TheLoop->getHeader();
996   DebugLoc DL = TheLoop->getStartLoc();
997 
998   if (I) {
999     CodeRegion = I->getParent();
1000     // If there is no debug location attached to the instruction, revert back to
1001     // using the loop's.
1002     if (I->getDebugLoc())
1003       DL = I->getDebugLoc();
1004   }
1005 
1006   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1007 }
1008 
1009 namespace llvm {
1010 
1011 /// Return a value for Step multiplied by VF.
1012 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
1013                        int64_t Step) {
1014   assert(Ty->isIntegerTy() && "Expected an integer step");
1015   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
1016   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1017 }
1018 
1019 /// Return the runtime value for VF.
1020 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
1021   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1022   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1023 }
1024 
1025 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
1026                                   ElementCount VF) {
1027   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1028   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1029   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1030   return B.CreateUIToFP(RuntimeVF, FTy);
1031 }
1032 
1033 void reportVectorizationFailure(const StringRef DebugMsg,
1034                                 const StringRef OREMsg, const StringRef ORETag,
1035                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1036                                 Instruction *I) {
1037   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1038   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1039   ORE->emit(
1040       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1041       << "loop not vectorized: " << OREMsg);
1042 }
1043 
1044 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1045                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1046                              Instruction *I) {
1047   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1048   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1049   ORE->emit(
1050       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1051       << Msg);
1052 }
1053 
1054 } // end namespace llvm
1055 
1056 #ifndef NDEBUG
1057 /// \return string containing a file name and a line # for the given loop.
1058 static std::string getDebugLocString(const Loop *L) {
1059   std::string Result;
1060   if (L) {
1061     raw_string_ostream OS(Result);
1062     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1063       LoopDbgLoc.print(OS);
1064     else
1065       // Just print the module name.
1066       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1067     OS.flush();
1068   }
1069   return Result;
1070 }
1071 #endif
1072 
1073 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1074                                          const Instruction *Orig) {
1075   // If the loop was versioned with memchecks, add the corresponding no-alias
1076   // metadata.
1077   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1078     LVer->annotateInstWithNoAlias(To, Orig);
1079 }
1080 
void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  // Note: Visited is shared across all slices, so a recipe reached from one
  // root is not re-traversed from another.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        // Only consecutive accesses whose underlying block needs predication
        // seed a backward slice.
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            // Gaps in the group yield null members; skip them.
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}
1160 
// Copy the common metadata from \p From onto \p To, then additionally attach
// the memcheck-based no-alias metadata (see addNewMetadata).
void InnerLoopVectorizer::addMetadata(Instruction *To,
                                      Instruction *From) {
  propagateMetadata(To, From);
  addNewMetadata(To, From);
}
1166 
1167 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1168                                       Instruction *From) {
1169   for (Value *V : To) {
1170     if (Instruction *I = dyn_cast<Instruction>(V))
1171       addMetadata(I, From);
1172   }
1173 }
1174 
1175 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1176     const RecurrenceDescriptor &RdxDesc) {
1177   auto It = ReductionResumeValues.find(&RdxDesc);
1178   assert(It != ReductionResumeValues.end() &&
1179          "Expected to find a resume value for the reduction.");
1180   return It->second;
1181 }
1182 
1183 namespace llvm {
1184 
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};
1207 
1208 /// ElementCountComparator creates a total ordering for ElementCount
1209 /// for the purposes of using it in a set structure.
1210 struct ElementCountComparator {
1211   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1212     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1213            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1214   }
1215 };
1216 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1217 
1218 /// LoopVectorizationCostModel - estimates the expected speedups due to
1219 /// vectorization.
1220 /// In many cases vectorization is not profitable. This can happen because of
1221 /// a number of reasons. In this class we mainly attempt to predict the
1222 /// expected speedup/slowdowns due to the supported instruction set. We use the
1223 /// TargetTransformInfo to query the different backends for the cost of
1224 /// different operations.
1225 class LoopVectorizationCostModel {
1226 public:
1227   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1228                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1229                              LoopVectorizationLegality *Legal,
1230                              const TargetTransformInfo &TTI,
1231                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1232                              AssumptionCache *AC,
1233                              OptimizationRemarkEmitter *ORE, const Function *F,
1234                              const LoopVectorizeHints *Hints,
1235                              InterleavedAccessInfo &IAI)
1236       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1237         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1238         Hints(Hints), InterleaveInfo(IAI) {}
1239 
1240   /// \return An upper bound for the vectorization factors (both fixed and
1241   /// scalable). If the factors are 0, vectorization and interleaving should be
1242   /// avoided up front.
1243   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1244 
1245   /// \return True if runtime checks are required for vectorization, and false
1246   /// otherwise.
1247   bool runtimeChecksRequired();
1248 
1249   /// \return The most profitable vectorization factor and the cost of that VF.
1250   /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1251   /// then this vectorization factor will be selected if vectorization is
1252   /// possible.
1253   VectorizationFactor
1254   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1255 
1256   VectorizationFactor
1257   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1258                                     const LoopVectorizationPlanner &LVP);
1259 
1260   /// Setup cost-based decisions for user vectorization factor.
1261   /// \return true if the UserVF is a feasible VF to be chosen.
1262   bool selectUserVectorizationFactor(ElementCount UserVF) {
1263     collectUniformsAndScalars(UserVF);
1264     collectInstsToScalarize(UserVF);
1265     return expectedCost(UserVF).first.isValid();
1266   }
1267 
1268   /// \return The size (in bits) of the smallest and widest types in the code
1269   /// that needs to be vectorized. We ignore values that remain scalar such as
1270   /// 64 bit loop indices.
1271   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1272 
1273   /// \return The desired interleave count.
1274   /// If interleave count has been specified by metadata it will be returned.
1275   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1276   /// are the selected vectorization factor and the cost of the selected VF.
1277   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1278 
1279   /// Memory access instruction may be vectorized in more than one way.
1280   /// Form of instruction after vectorization depends on cost.
1281   /// This function takes cost-based decisions for Load/Store instructions
1282   /// and collects them in a map. This decisions map is used for building
1283   /// the lists of loop-uniform and loop-scalar instructions.
1284   /// The calculated cost is saved with widening decision in order to
1285   /// avoid redundant calculations.
1286   void setCostBasedWideningDecision(ElementCount VF);
1287 
1288   /// A struct that represents some properties of the register usage
1289   /// of a loop.
1290   struct RegisterUsage {
1291     /// Holds the number of loop invariant values that are used in the loop.
1292     /// The key is ClassID of target-provided register class.
1293     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1294     /// Holds the maximum number of concurrent live intervals in the loop.
1295     /// The key is ClassID of target-provided register class.
1296     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1297   };
1298 
1299   /// \return Returns information about the register usages of the loop for the
1300   /// given vectorization factors.
1301   SmallVector<RegisterUsage, 8>
1302   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1303 
1304   /// Collect values we want to ignore in the cost model.
1305   void collectValuesToIgnore();
1306 
1307   /// Collect all element types in the loop for which widening is needed.
1308   void collectElementTypesForWidening();
1309 
1310   /// Split reductions into those that happen in the loop, and those that happen
1311   /// outside. In loop reductions are collected into InLoopReductionChains.
1312   void collectInLoopReductions();
1313 
1314   /// Returns true if we should use strict in-order reductions for the given
1315   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1316   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1317   /// of FP operations.
1318   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1319     return !Hints->allowReordering() && RdxDesc.isOrdered();
1320   }
1321 
1322   /// \returns The smallest bitwidth each instruction can be represented with.
1323   /// The vector equivalents of these instructions should be truncated to this
1324   /// type.
1325   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1326     return MinBWs;
1327   }
1328 
1329   /// \returns True if it is more profitable to scalarize instruction \p I for
1330   /// vectorization factor \p VF.
1331   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1332     assert(VF.isVector() &&
1333            "Profitable to scalarize relevant only for VF > 1.");
1334 
1335     // Cost model is not run in the VPlan-native path - return conservative
1336     // result until this changes.
1337     if (EnableVPlanNativePath)
1338       return false;
1339 
1340     auto Scalars = InstsToScalarize.find(VF);
1341     assert(Scalars != InstsToScalarize.end() &&
1342            "VF not yet analyzed for scalarization profitability");
1343     return Scalars->second.find(I) != Scalars->second.end();
1344   }
1345 
1346   /// Returns true if \p I is known to be uniform after vectorization.
1347   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1348     if (VF.isScalar())
1349       return true;
1350 
1351     // Cost model is not run in the VPlan-native path - return conservative
1352     // result until this changes.
1353     if (EnableVPlanNativePath)
1354       return false;
1355 
1356     auto UniformsPerVF = Uniforms.find(VF);
1357     assert(UniformsPerVF != Uniforms.end() &&
1358            "VF not yet analyzed for uniformity");
1359     return UniformsPerVF->second.count(I);
1360   }
1361 
1362   /// Returns true if \p I is known to be scalar after vectorization.
1363   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1364     if (VF.isScalar())
1365       return true;
1366 
1367     // Cost model is not run in the VPlan-native path - return conservative
1368     // result until this changes.
1369     if (EnableVPlanNativePath)
1370       return false;
1371 
1372     auto ScalarsPerVF = Scalars.find(VF);
1373     assert(ScalarsPerVF != Scalars.end() &&
1374            "Scalar values are not calculated for VF");
1375     return ScalarsPerVF->second.count(I);
1376   }
1377 
1378   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1379   /// for vectorization factor \p VF.
1380   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1381     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1382            !isProfitableToScalarize(I, VF) &&
1383            !isScalarAfterVectorization(I, VF);
1384   }
1385 
  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,       // No decision has been recorded (see getWideningDecision).
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,    // Access handled as part of an interleave group.
    CM_GatherScatter, // Access lowered as a masked gather/scatter.
    CM_Scalarize      // Access replicated as scalar operations.
  };
1395 
1396   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1397   /// instruction \p I and vector width \p VF.
1398   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1399                            InstructionCost Cost) {
1400     assert(VF.isVector() && "Expected VF >=2");
1401     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1402   }
1403 
1404   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1405   /// interleaving group \p Grp and vector width \p VF.
1406   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1407                            ElementCount VF, InstWidening W,
1408                            InstructionCost Cost) {
1409     assert(VF.isVector() && "Expected VF >=2");
1410     /// Broadcast this decicion to all instructions inside the group.
1411     /// But the cost will be assigned to one instruction only.
1412     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1413       if (auto *I = Grp->getMember(i)) {
1414         if (Grp->getInsertPos() == I)
1415           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1416         else
1417           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1418       }
1419     }
1420   }
1421 
1422   /// Return the cost model decision for the given instruction \p I and vector
1423   /// width \p VF. Return CM_Unknown if this instruction did not pass
1424   /// through the cost modeling.
1425   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1426     assert(VF.isVector() && "Expected VF to be a vector VF");
1427     // Cost model is not run in the VPlan-native path - return conservative
1428     // result until this changes.
1429     if (EnableVPlanNativePath)
1430       return CM_GatherScatter;
1431 
1432     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1433     auto Itr = WideningDecisions.find(InstOnVF);
1434     if (Itr == WideningDecisions.end())
1435       return CM_Unknown;
1436     return Itr->second.first;
1437   }
1438 
1439   /// Return the vectorization cost for the given instruction \p I and vector
1440   /// width \p VF.
1441   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1442     assert(VF.isVector() && "Expected VF >=2");
1443     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1444     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1445            "The cost is not calculated");
1446     return WideningDecisions[InstOnVF].second;
1447   }
1448 
1449   /// Return True if instruction \p I is an optimizable truncate whose operand
1450   /// is an induction variable. Such a truncate will be removed by adding a new
1451   /// induction variable with the destination type.
1452   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1453     // If the instruction is not a truncate, return false.
1454     auto *Trunc = dyn_cast<TruncInst>(I);
1455     if (!Trunc)
1456       return false;
1457 
1458     // Get the source and destination types of the truncate.
1459     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1460     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1461 
1462     // If the truncate is free for the given types, return false. Replacing a
1463     // free truncate with an induction variable would add an induction variable
1464     // update instruction to each iteration of the loop. We exclude from this
1465     // check the primary induction variable since it will need an update
1466     // instruction regardless.
1467     Value *Op = Trunc->getOperand(0);
1468     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1469       return false;
1470 
1471     // If the truncated value is not an induction variable, return false.
1472     return Legal->isInductionPhi(Op);
1473   }
1474 
  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop; results are recorded per-VF in InstsToScalarize.
  void collectInstsToScalarize(ElementCount VF);
1478 
1479   /// Collect Uniform and Scalar values for the given \p VF.
1480   /// The sets depend on CM decision for Load/Store instructions
1481   /// that may be vectorized as interleave, gather-scatter or scalarized.
1482   void collectUniformsAndScalars(ElementCount VF) {
1483     // Do the analysis once.
1484     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1485       return;
1486     setCostBasedWideningDecision(VF);
1487     collectLoopUniforms(VF);
1488     collectLoopScalars(VF);
1489   }
1490 
1491   /// Returns true if the target machine supports masked store operation
1492   /// for the given \p DataType and kind of access to \p Ptr.
1493   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1494     return Legal->isConsecutivePtr(DataType, Ptr) &&
1495            TTI.isLegalMaskedStore(DataType, Alignment);
1496   }
1497 
1498   /// Returns true if the target machine supports masked load operation
1499   /// for the given \p DataType and kind of access to \p Ptr.
1500   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1501     return Legal->isConsecutivePtr(DataType, Ptr) &&
1502            TTI.isLegalMaskedLoad(DataType, Alignment);
1503   }
1504 
1505   /// Returns true if the target machine can represent \p V as a masked gather
1506   /// or scatter operation.
1507   bool isLegalGatherOrScatter(Value *V,
1508                               ElementCount VF = ElementCount::getFixed(1)) {
1509     bool LI = isa<LoadInst>(V);
1510     bool SI = isa<StoreInst>(V);
1511     if (!LI && !SI)
1512       return false;
1513     auto *Ty = getLoadStoreType(V);
1514     Align Align = getLoadStoreAlignment(V);
1515     if (VF.isVector())
1516       Ty = VectorType::get(Ty, VF);
1517     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1518            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1519   }
1520 
1521   /// Returns true if the target machine supports all of the reduction
1522   /// variables found for the given VF.
1523   bool canVectorizeReductions(ElementCount VF) const {
1524     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1525       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1526       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1527     }));
1528   }
1529 
  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication when vectorizing \p I with vectorization factor \p VF. Such
  /// instructions include conditional stores and instructions that may divide
  /// by zero. See also isPredicatedInst, which covers a superset of these
  /// instructions.
  bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1535 
1536   // Returns true if \p I is an instruction that will be predicated either
1537   // through scalar predication or masked load/store or masked gather/scatter.
1538   // \p VF is the vectorization factor that will be used to vectorize \p I.
1539   // Superset of instructions that return true for isScalarWithPredication.
1540   bool isPredicatedInst(Instruction *I, ElementCount VF,
1541                         bool IsKnownUniform = false) {
1542     // When we know the load is uniform and the original scalar loop was not
1543     // predicated we don't need to mark it as a predicated instruction. Any
1544     // vectorised blocks created when tail-folding are something artificial we
1545     // have introduced and we know there is always at least one active lane.
1546     // That's why we call Legal->blockNeedsPredication here because it doesn't
1547     // query tail-folding.
1548     if (IsKnownUniform && isa<LoadInst>(I) &&
1549         !Legal->blockNeedsPredication(I->getParent()))
1550       return false;
1551     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1552       return false;
1553     // Loads and stores that need some form of masked operation are predicated
1554     // instructions.
1555     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1556       return Legal->isMaskRequired(I);
1557     return isScalarWithPredication(I, VF);
1558   }
1559 
  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  /// Forwards to InterleaveInfo.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to.
  /// Forwards to InterleaveInfo.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }
1583 
1584   /// Returns true if we're required to use a scalar epilogue for at least
1585   /// the final iteration of the original loop.
1586   bool requiresScalarEpilogue(ElementCount VF) const {
1587     if (!isScalarEpilogueAllowed())
1588       return false;
1589     // If we might exit from anywhere but the latch, must run the exiting
1590     // iteration in scalar form.
1591     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1592       return true;
1593     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1594   }
1595 
  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold the tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the in-loop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chains of instructions representing the in-loop reductions.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an in-loop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }
1627 
  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF. Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available, or is too
  /// expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;
1645 
1646   /// Invalidates decisions already taken by the cost model.
1647   void invalidateCostModelingDecisions() {
1648     WideningDecisions.clear();
1649     Uniforms.clear();
1650     Scalars.clear();
1651   }
1652 
private:
  // NOTE(review): counter for predicated stores; incremented elsewhere in
  // this file — confirm at the use sites.
  unsigned NumPredStores = 0;

  /// Convenience function that returns the value of vscale_range iff
  /// vscale_range.min == vscale_range.max or otherwise returns the value
  /// returned by the corresponding TLI method.
  Optional<unsigned> getVScaleForTuning() const;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF,
                                       bool FoldTailByMasking);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1705 
  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an in-loop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1755 
  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
  ReductionChainMap InLoopReductionChains;

  /// A Map of in-loop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// vplan. This was added to allow quick lookup to the in-loop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1810 
  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. collectLoopScalars should only add non-uniform nodes
  /// to the list if they are used by a load/store instruction that is marked as
  /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
  /// VF values in the vectorized loop, each corresponding to an iteration of
  /// the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  /// The widening decision (and its cost) recorded per instruction and VF.
  DecisionList WideningDecisions;
1845 
1846   /// Returns true if \p V is expected to be vectorized and it needs to be
1847   /// extracted.
1848   bool needsExtract(Value *V, ElementCount VF) const {
1849     Instruction *I = dyn_cast<Instruction>(V);
1850     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1851         TheLoop->isLoopInvariant(I))
1852       return false;
1853 
1854     // Assume we can vectorize V (and hence we need extraction) if the
1855     // scalars are not computed yet. This can happen, because it is called
1856     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1857     // the scalars are collected. That should be a safe assumption in most
1858     // cases, because we check if the operands have vectorizable types
1859     // beforehand in LoopVectorizationLegality.
1860     return Scalars.find(VF) == Scalars.end() ||
1861            !isScalarAfterVectorization(I, VF);
1862   };
1863 
1864   /// Returns a range containing only operands needing to be extracted.
1865   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1866                                                    ElementCount VF) const {
1867     return SmallVector<Value *, 4>(make_filter_range(
1868         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1869   }
1870 
  /// Determines if we have the infrastructure to vectorize loop \p L and its
  /// epilogue, assuming the main loop is vectorized by \p VF.
  bool isCandidateForEpilogueVectorization(const Loop &L,
                                           const ElementCount VF) const;

  /// Returns true if epilogue vectorization is considered profitable, and
  /// false otherwise.
  /// \p VF is the vectorization factor chosen for the original loop.
  bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1880 
public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The function that contains the loop being evaluated.
  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;

  /// Profitable vector factors.
  SmallVector<VectorizationFactor, 8> ProfitableVFs;
1929 };
1930 } // end namespace llvm
1931 
1932 /// Helper struct to manage generating runtime checks for vectorization.
1933 ///
1934 /// The runtime checks are created up-front in temporary blocks to allow better
1935 /// estimating the cost and un-linked from the existing IR. After deciding to
1936 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1937 /// temporary blocks are completely removed.
1938 class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Value *MemRuntimeCheckCond = nullptr;

  /// Dominator tree; kept up to date as check blocks are created and removed.
  DominatorTree *DT;

  /// Loop info; kept up to date as check blocks are created and removed.
  LoopInfo *LI;

  /// Expander used to materialize the SCEV predicate checks.
  SCEVExpander SCEVExp;

  /// Expander used to materialize the memory runtime checks.
  SCEVExpander MemCheckExp;
1959 
public:
  /// Construct the helper; no check blocks exist until Create() is called.
  // NOTE(review): both expanders are given the "scev.check" name prefix, so
  // memory-check values are also named "scev.check*" — confirm whether a
  // distinct prefix (e.g. "mem.check") was intended.
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}
1965 
  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVPredicate &Pred) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!Pred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &Pred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      // NOTE(review): this local `Pred` (a block) shadows the SCEVPredicate
      // parameter `Pred`; correct here, but a rename would aid readability.
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      MemRuntimeCheckCond =
          addRuntimeChecks(MemCheckBlock->getTerminator(), L,
                           RtPtrChecking.getChecks(), MemCheckExp);
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    // Nothing was generated; nothing to unhook.
    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(Preheader);

    // Move each check block's terminator back into the preheader (restoring
    // the original edge out of the preheader) and leave the detached block
    // with a placeholder unreachable terminator so it stays well-formed.
    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }

    // Drop the detached blocks from the dominator tree and loop info, since
    // they are no longer reachable from the function's CFG.
    DT->changeImmediateDominator(LoopHeader, Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(MemCheckBlock);
      LI->removeBlock(MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(SCEVCheckBlock);
      LI->removeBlock(SCEVCheckBlock);
    }
  }
2034 
  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
    // A null cond means either no check was created, or the check was handed
    // out by emitSCEVChecks/emitMemRuntimeChecks (which null the cond to mark
    // it as used) — in both cases the expander's result must not be removed.
    if (!SCEVCheckCond)
      SCEVCleaner.markResultUsed();

    if (!MemRuntimeCheckCond)
      MemCheckCleaner.markResultUsed();

    if (MemRuntimeCheckCond) {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      // Walk the block bottom-up so users are erased before their operands;
      // instructions the expander itself inserted are left for its cleaner.
      for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(&I))
          continue;
        SE.forgetValue(&I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    // A non-null cond means the check was never emitted into the function, so
    // its (unlinked) block is still around and must be deleted entirely.
    if (SCEVCheckCond)
      SCEVCheckBlock->eraseFromParent();
    if (MemRuntimeCheckCond)
      MemCheckBlock->eraseFromParent();
  }
2065 
  /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
  /// adjusts the branches to branch to the vector preheader or \p Bypass,
  /// depending on the generated condition. Returns the emitted block, or
  /// nullptr when no SCEV check is needed.
  /// NOTE(review): \p LoopExitBlock is not referenced in this function;
  /// confirm whether the parameter is still needed by callers.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
                             BasicBlock *LoopVectorPreHeader,
                             BasicBlock *LoopExitBlock) {
    // No check was expanded, or it was already consumed by a prior call.
    if (!SCEVCheckCond)
      return nullptr;
    // A constant-zero condition means the check trivially passes; skip it.
    if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
      if (C->isZero())
        return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();

    BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // Create new preheader for vector loop.
    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);

    SCEVCheckBlock->getTerminator()->eraseFromParent();
    SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
    // Route the single predecessor edge through the check block and update
    // the dominator tree to match.
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                SCEVCheckBlock);

    DT->addNewBlock(SCEVCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);

    // Terminate the check block with a conditional branch: take the bypass
    // when the check condition is true, otherwise enter the vector preheader.
    ReplaceInstWithInst(
        SCEVCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
    // Mark the check as used, to prevent it from being removed during cleanup.
    SCEVCheckCond = nullptr;
    return SCEVCheckBlock;
  }
2100 
  /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
  /// the branches to branch to the vector preheader or \p Bypass, depending on
  /// the generated condition. Returns the emitted block, or nullptr when no
  /// memory runtime checks were generated.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
                                   BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks in runtime if arrays overlap.
    if (!MemRuntimeCheckCond)
      return nullptr;

    // Route the single predecessor edge through MemCheckBlock and update the
    // dominator tree to match.
    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                MemCheckBlock);

    DT->addNewBlock(MemCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
    MemCheckBlock->moveBefore(LoopVectorPreHeader);

    // Keep LoopInfo consistent when the preheader sits inside an outer loop.
    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(MemCheckBlock, *LI);

    // Replace the placeholder terminator with a conditional branch: take the
    // bypass when the runtime check condition is true.
    ReplaceInstWithInst(
        MemCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
    MemCheckBlock->getTerminator()->setDebugLoc(
        Pred->getTerminator()->getDebugLoc());

    // Mark the check as used, to prevent it from being removed during cleanup.
    MemRuntimeCheckCond = nullptr;
    return MemCheckBlock;
  }
2131 };
2132 
2133 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2134 // vectorization. The loop needs to be annotated with #pragma omp simd
2135 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2136 // vector length information is not provided, vectorization is not considered
2137 // explicit. Interleave hints are not allowed either. These limitations will be
2138 // relaxed in the future.
2139 // Please, note that we are currently forced to abuse the pragma 'clang
2140 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2141 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2142 // provides *explicit vectorization hints* (LV can bypass legal checks and
2143 // assume that vectorization is legal). However, both hints are implemented
2144 // using the same metadata (llvm.loop.vectorize, processed by
2145 // LoopVectorizeHints). This will be fixed in the future when the native IR
2146 // representation for pragma 'omp simd' is introduced.
2147 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2148                                    OptimizationRemarkEmitter *ORE) {
2149   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2150   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2151 
2152   // Only outer loops with an explicit vectorization hint are supported.
2153   // Unannotated outer loops are ignored.
2154   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2155     return false;
2156 
2157   Function *Fn = OuterLp->getHeader()->getParent();
2158   if (!Hints.allowVectorization(Fn, OuterLp,
2159                                 true /*VectorizeOnlyWhenForced*/)) {
2160     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2161     return false;
2162   }
2163 
2164   if (Hints.getInterleave() > 1) {
2165     // TODO: Interleave support is future work.
2166     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2167                          "outer loops.\n");
2168     Hints.emitRemarkWithHints();
2169     return false;
2170   }
2171 
2172   return true;
2173 }
2174 
2175 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2176                                   OptimizationRemarkEmitter *ORE,
2177                                   SmallVectorImpl<Loop *> &V) {
2178   // Collect inner loops and outer loops without irreducible control flow. For
2179   // now, only collect outer loops that have explicit vectorization hints. If we
2180   // are stress testing the VPlan H-CFG construction, we collect the outermost
2181   // loop of every loop nest.
2182   if (L.isInnermost() || VPlanBuildStressTest ||
2183       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2184     LoopBlocksRPO RPOT(&L);
2185     RPOT.perform(LI);
2186     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2187       V.push_back(&L);
2188       // TODO: Collect inner loops inside marked outer loops in case
2189       // vectorization fails for the outer loop. Do not invoke
2190       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2191       // already known to be reducible. We can use an inherited attribute for
2192       // that.
2193       return;
2194     }
2195   }
2196   for (Loop *InnerL : L)
2197     collectSupportedLoops(*InnerL, LI, ORE, V);
2198 }
2199 
2200 namespace {
2201 
/// The LoopVectorize Pass.
///
/// Legacy-pass-manager wrapper: gathers the required analyses and delegates
/// the actual work to the shared LoopVectorizePass implementation.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  /// Shared implementation, also used by the new pass manager.
  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID),
        Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    // Gather all analyses that runImpl consumes.
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    // TargetLibraryInfo is optional; pass nullptr when unavailable.
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    // Lazily compute LoopAccessInfo per loop through the legacy analysis.
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI).MadeAnyChange;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addRequired<InjectTLIMappingsLegacy>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};
2268 
2269 } // end anonymous namespace
2270 
2271 //===----------------------------------------------------------------------===//
2272 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2273 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2274 //===----------------------------------------------------------------------===//
2275 
2276 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2277   // We need to place the broadcast of invariant variables outside the loop,
2278   // but only if it's proven safe to do so. Else, broadcast will be inside
2279   // vector loop body.
2280   Instruction *Instr = dyn_cast<Instruction>(V);
2281   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2282                      (!Instr ||
2283                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2284   // Place the code for broadcasting invariant variables in the new preheader.
2285   IRBuilder<>::InsertPointGuard Guard(Builder);
2286   if (SafeToHoist)
2287     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2288 
2289   // Broadcast the scalar into all locations in the vector.
2290   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2291 
2292   return Shuf;
2293 }
2294 
2295 /// This function adds
2296 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
2297 /// to each vector element of Val. The sequence starts at StartIndex.
2298 /// \p Opcode is relevant for FP induction variable.
2299 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2300                             Instruction::BinaryOps BinOp, ElementCount VF,
2301                             IRBuilderBase &Builder) {
2302   assert(VF.isVector() && "only vector VFs are supported");
2303 
2304   // Create and check the types.
2305   auto *ValVTy = cast<VectorType>(Val->getType());
2306   ElementCount VLen = ValVTy->getElementCount();
2307 
2308   Type *STy = Val->getType()->getScalarType();
2309   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2310          "Induction Step must be an integer or FP");
2311   assert(Step->getType() == STy && "Step has wrong type");
2312 
2313   SmallVector<Constant *, 8> Indices;
2314 
2315   // Create a vector of consecutive numbers from zero to VF.
2316   VectorType *InitVecValVTy = ValVTy;
2317   if (STy->isFloatingPointTy()) {
2318     Type *InitVecValSTy =
2319         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2320     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2321   }
2322   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2323 
2324   // Splat the StartIdx
2325   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2326 
2327   if (STy->isIntegerTy()) {
2328     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2329     Step = Builder.CreateVectorSplat(VLen, Step);
2330     assert(Step->getType() == Val->getType() && "Invalid step vec");
2331     // FIXME: The newly created binary instructions should contain nsw/nuw
2332     // flags, which can be found from the original scalar operations.
2333     Step = Builder.CreateMul(InitVec, Step);
2334     return Builder.CreateAdd(Val, Step, "induction");
2335   }
2336 
2337   // Floating point induction.
2338   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2339          "Binary Opcode should be specified for FP induction");
2340   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2341   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2342 
2343   Step = Builder.CreateVectorSplat(VLen, Step);
2344   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2345   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2346 }
2347 
/// Compute scalar induction steps. \p ScalarIV is the scalar induction
/// variable on which to base the steps, \p Step is the size of the step.
/// Results are recorded per (Part, Lane) in \p State for \p Def.
static void buildScalarSteps(Value *ScalarIV, Value *Step,
                             const InductionDescriptor &ID, VPValue *Def,
                             VPTransformState &State) {
  IRBuilderBase &Builder = State.Builder;
  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(State.VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same integer type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy == Step->getType() &&
         "Val and Step should have the same type");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    // For FP inductions the add/sub opcode comes from the original operation.
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration. When only the first lane is demanded, one scalar per part
  // suffices.
  bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
  unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
                                     ScalarIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  // For scalable VFs the lanes cannot all be enumerated individually, so
  // precompute splatted operands for a whole-vector computation below.
  if (!FirstLaneOnly && State.VF.isScalable()) {
    VecIVTy = VectorType::get(ScalarIVTy, State.VF);
    UnitStepVec =
        Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
    SplatStep = Builder.CreateVectorSplat(State.VF, Step);
    SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
  }

  for (unsigned Part = 0; Part < State.UF; ++Part) {
    // Lane-0 index of this unroll part (runtime-evaluated for scalable VFs).
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);

    if (!FirstLaneOnly && State.VF.isScalable()) {
      // Whole-vector form: IV + (StartIdx0 + <0,1,2,...>) * Step.
      auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (ScalarIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(Def, Add, Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    if (ScalarIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);

    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      // Per-lane form: IV + (StartIdx0 + Lane) * Step.
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
      State.set(Def, Add, VPIteration(Part, Lane));
    }
  }
}
2423 
2424 // Generate code for the induction step. Note that induction steps are
2425 // required to be loop-invariant
2426 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2427                               Instruction *InsertBefore,
2428                               Loop *OrigLoop = nullptr) {
2429   const DataLayout &DL = SE.getDataLayout();
2430   assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2431          "Induction step should be loop invariant");
2432   if (auto *E = dyn_cast<SCEVUnknown>(Step))
2433     return E->getValue();
2434 
2435   SCEVExpander Exp(SE, DL, "induction");
2436   return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2437 }
2438 
/// Compute the transformed value of Index at offset StartValue using step
/// StepValue.
/// For integer induction, returns StartValue + Index * StepValue.
/// For pointer induction, returns StartValue[Index * StepValue].
/// FIXME: The newly created binary instructions should contain nsw/nuw
/// flags, which can be found from the original scalar operations.
static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
                                   Value *StartValue, Value *Step,
                                   const InductionDescriptor &ID) {
  assert(Index->getType()->getScalarType() == Step->getType() &&
         "Index scalar type does not match StepValue type");

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.

  // Add with manual identity folding: X + 0 and 0 + X collapse to the other
  // operand without emitting an instruction.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count.
  // Multiply with manual identity folding: X * 1 and 1 * X collapse to the
  // other operand without emitting an instruction.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    VectorType *XVTy = dyn_cast<VectorType>(X->getType());
    if (XVTy && !isa<VectorType>(Y->getType()))
      Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
    return B.CreateMul(X, Y);
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    // Special-case Step == -1 as a subtraction instead of mul+add.
    if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(Index, Step);
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<Constant>(Step) &&
           "Expected constant step for pointer induction");
    // GEP from the start pointer by Index * Step elements.
    return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    // Reuse the original fadd/fsub opcode recorded in the descriptor.
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    Value *MulExp = B.CreateFMul(Step, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                         "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
2520 
2521 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2522                                                     const VPIteration &Instance,
2523                                                     VPTransformState &State) {
2524   Value *ScalarInst = State.get(Def, Instance);
2525   Value *VectorValue = State.get(Def, Instance.Part);
2526   VectorValue = Builder.CreateInsertElement(
2527       VectorValue, ScalarInst,
2528       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2529   State.set(Def, VectorValue, Instance.Part);
2530 }
2531 
2532 // Return whether we allow using masked interleave-groups (for dealing with
2533 // strided loads/stores that reside in predicated blocks, or for dealing
2534 // with gaps).
2535 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2536   // If an override option has been passed in for interleaved accesses, use it.
2537   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2538     return EnableMaskedInterleavedMemAccesses;
2539 
2540   return TTI.enableMaskedInterleavedAccessVectorization();
2541 }
2542 
2543 // Try to vectorize the interleave group that \p Instr belongs to.
2544 //
2545 // E.g. Translate following interleaved load group (factor = 3):
2546 //   for (i = 0; i < N; i+=3) {
2547 //     R = Pic[i];             // Member of index 0
2548 //     G = Pic[i+1];           // Member of index 1
2549 //     B = Pic[i+2];           // Member of index 2
2550 //     ... // do something to R, G, B
2551 //   }
2552 // To:
2553 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2554 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2555 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2556 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2557 //
2558 // Or translate following interleaved store group (factor = 3):
2559 //   for (i = 0; i < N; i+=3) {
2560 //     ... do something to R, G, B
2561 //     Pic[i]   = R;           // Member of index 0
2562 //     Pic[i+1] = G;           // Member of index 1
2563 //     Pic[i+2] = B;           // Member of index 2
2564 //   }
2565 // To:
2566 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2567 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2568 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2569 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2570 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2571 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2572     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2573     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2574     VPValue *BlockInMask) {
2575   Instruction *Instr = Group->getInsertPos();
2576   const DataLayout &DL = Instr->getModule()->getDataLayout();
2577 
2578   // Prepare for the vector type of the interleaved load/store.
2579   Type *ScalarTy = getLoadStoreType(Instr);
2580   unsigned InterleaveFactor = Group->getFactor();
2581   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2582   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2583 
2584   // Prepare for the new pointers.
2585   SmallVector<Value *, 2> AddrParts;
2586   unsigned Index = Group->getIndex(Instr);
2587 
2588   // TODO: extend the masked interleaved-group support to reversed access.
2589   assert((!BlockInMask || !Group->isReverse()) &&
2590          "Reversed masked interleave-group not supported.");
2591 
2592   // If the group is reverse, adjust the index to refer to the last vector lane
2593   // instead of the first. We adjust the index from the first vector lane,
2594   // rather than directly getting the pointer for lane VF - 1, because the
2595   // pointer operand of the interleaved access is supposed to be uniform. For
2596   // uniform instructions, we're only required to generate a value for the
2597   // first vector lane in each unroll iteration.
2598   if (Group->isReverse())
2599     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2600 
2601   for (unsigned Part = 0; Part < UF; Part++) {
2602     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2603     setDebugLocFromInst(AddrPart);
2604 
2605     // Notice current instruction could be any index. Need to adjust the address
2606     // to the member of index 0.
2607     //
2608     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2609     //       b = A[i];       // Member of index 0
2610     // Current pointer is pointed to A[i+1], adjust it to A[i].
2611     //
2612     // E.g.  A[i+1] = a;     // Member of index 1
2613     //       A[i]   = b;     // Member of index 0
2614     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2615     // Current pointer is pointed to A[i+2], adjust it to A[i].
2616 
2617     bool InBounds = false;
2618     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2619       InBounds = gep->isInBounds();
2620     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2621     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2622 
2623     // Cast to the vector pointer type.
2624     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2625     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2626     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2627   }
2628 
2629   setDebugLocFromInst(Instr);
2630   Value *PoisonVec = PoisonValue::get(VecTy);
2631 
2632   Value *MaskForGaps = nullptr;
2633   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2634     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2635     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2636   }
2637 
2638   // Vectorize the interleaved load group.
2639   if (isa<LoadInst>(Instr)) {
2640     // For each unroll part, create a wide load for the group.
2641     SmallVector<Value *, 2> NewLoads;
2642     for (unsigned Part = 0; Part < UF; Part++) {
2643       Instruction *NewLoad;
2644       if (BlockInMask || MaskForGaps) {
2645         assert(useMaskedInterleavedAccesses(*TTI) &&
2646                "masked interleaved groups are not allowed.");
2647         Value *GroupMask = MaskForGaps;
2648         if (BlockInMask) {
2649           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2650           Value *ShuffledMask = Builder.CreateShuffleVector(
2651               BlockInMaskPart,
2652               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2653               "interleaved.mask");
2654           GroupMask = MaskForGaps
2655                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2656                                                 MaskForGaps)
2657                           : ShuffledMask;
2658         }
2659         NewLoad =
2660             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2661                                      GroupMask, PoisonVec, "wide.masked.vec");
2662       }
2663       else
2664         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2665                                             Group->getAlign(), "wide.vec");
2666       Group->addMetadata(NewLoad);
2667       NewLoads.push_back(NewLoad);
2668     }
2669 
2670     // For each member in the group, shuffle out the appropriate data from the
2671     // wide loads.
2672     unsigned J = 0;
2673     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2674       Instruction *Member = Group->getMember(I);
2675 
2676       // Skip the gaps in the group.
2677       if (!Member)
2678         continue;
2679 
2680       auto StrideMask =
2681           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2682       for (unsigned Part = 0; Part < UF; Part++) {
2683         Value *StridedVec = Builder.CreateShuffleVector(
2684             NewLoads[Part], StrideMask, "strided.vec");
2685 
2686         // If this member has different type, cast the result type.
2687         if (Member->getType() != ScalarTy) {
2688           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2689           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2690           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2691         }
2692 
2693         if (Group->isReverse())
2694           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2695 
2696         State.set(VPDefs[J], StridedVec, Part);
2697       }
2698       ++J;
2699     }
2700     return;
2701   }
2702 
2703   // The sub vector type for current instruction.
2704   auto *SubVT = VectorType::get(ScalarTy, VF);
2705 
2706   // Vectorize the interleaved store group.
2707   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2708   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2709          "masked interleaved groups are not allowed.");
2710   assert((!MaskForGaps || !VF.isScalable()) &&
2711          "masking gaps for scalable vectors is not yet supported.");
2712   for (unsigned Part = 0; Part < UF; Part++) {
2713     // Collect the stored vector from each member.
2714     SmallVector<Value *, 4> StoredVecs;
2715     for (unsigned i = 0; i < InterleaveFactor; i++) {
2716       assert((Group->getMember(i) || MaskForGaps) &&
2717              "Fail to get a member from an interleaved store group");
2718       Instruction *Member = Group->getMember(i);
2719 
2720       // Skip the gaps in the group.
2721       if (!Member) {
2722         Value *Undef = PoisonValue::get(SubVT);
2723         StoredVecs.push_back(Undef);
2724         continue;
2725       }
2726 
2727       Value *StoredVec = State.get(StoredValues[i], Part);
2728 
2729       if (Group->isReverse())
2730         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2731 
2732       // If this member has different type, cast it to a unified type.
2733 
2734       if (StoredVec->getType() != SubVT)
2735         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2736 
2737       StoredVecs.push_back(StoredVec);
2738     }
2739 
2740     // Concatenate all vectors into a wide vector.
2741     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2742 
2743     // Interleave the elements in the wide vector.
2744     Value *IVec = Builder.CreateShuffleVector(
2745         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2746         "interleaved.vec");
2747 
2748     Instruction *NewStoreInstr;
2749     if (BlockInMask || MaskForGaps) {
2750       Value *GroupMask = MaskForGaps;
2751       if (BlockInMask) {
2752         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2753         Value *ShuffledMask = Builder.CreateShuffleVector(
2754             BlockInMaskPart,
2755             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2756             "interleaved.mask");
2757         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2758                                                       ShuffledMask, MaskForGaps)
2759                                 : ShuffledMask;
2760       }
2761       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2762                                                 Group->getAlign(), GroupMask);
2763     } else
2764       NewStoreInstr =
2765           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2766 
2767     Group->addMetadata(NewStoreInstr);
2768   }
2769 }
2770 
// Clone the scalar instruction `Instr` for a single (part, lane) position
// described by `Instance`, wire its operands to the per-lane scalar values
// recorded in `State`, and register the clone as the value produced by
// `RepRecipe` for that instance.
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                               VPReplicateRecipe *RepRecipe,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr,
                                               VPTransformState &State) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
  // the first lane and part.
  if (isa<NoAliasScopeDeclInst>(Instr))
    if (!Instance.isFirstIteration())
      return;

  setDebugLocFromInst(Instr);

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  // If the scalarized instruction contributes to the address computation of a
  // widen masked load/store which was in a basic block that needed predication
  // and is not predicated after vectorization, we can't propagate
  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
  // instruction could feed a poison value to the base address of the widen
  // load/store.
  if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
    Cloned->dropPoisonGeneratingFlags();

  // Sync the state's builder with this vectorizer's builder so that any code
  // materialized by State.get() below lands at the current insertion point.
  State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
                               Builder.GetInsertPoint());
  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (auto &I : enumerate(RepRecipe->operands())) {
    auto InputInstance = Instance;
    VPValue *Operand = I.value();
    VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
    // A uniform replicated operand has one value per part; always read it
    // from the first lane.
    if (OperandR && OperandR->isUniform())
      InputInstance.Lane = VPLane::getFirstLane();
    Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  State.set(RepRecipe, Cloned, Instance);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Cloned))
    AC->registerAssumption(II);

  // End if-block: predicated instructions are collected so predication can be
  // applied to them later.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}
2829 
2830 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2831   if (TripCount)
2832     return TripCount;
2833 
2834   assert(InsertBlock);
2835   IRBuilder<> Builder(InsertBlock->getTerminator());
2836   // Find the loop boundaries.
2837   ScalarEvolution *SE = PSE.getSE();
2838   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2839   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2840          "Invalid loop count");
2841 
2842   Type *IdxTy = Legal->getWidestInductionType();
2843   assert(IdxTy && "No type for induction");
2844 
2845   // The exit count might have the type of i64 while the phi is i32. This can
2846   // happen if we have an induction variable that is sign extended before the
2847   // compare. The only way that we get a backedge taken count is that the
2848   // induction variable was signed and as such will not overflow. In such a case
2849   // truncation is legal.
2850   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2851       IdxTy->getPrimitiveSizeInBits())
2852     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2853   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2854 
2855   // Get the total trip count from the count by adding 1.
2856   const SCEV *ExitCount = SE->getAddExpr(
2857       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2858 
2859   const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2860 
2861   // Expand the trip count and place the new instructions in the preheader.
2862   // Notice that the pre-header does not change, only the loop body.
2863   SCEVExpander Exp(*SE, DL, "induction");
2864 
2865   // Count holds the overall loop count (N).
2866   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2867                                 InsertBlock->getTerminator());
2868 
2869   if (TripCount->getType()->isPointerTy())
2870     TripCount =
2871         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2872                                     InsertBlock->getTerminator());
2873 
2874   return TripCount;
2875 }
2876 
2877 Value *
2878 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2879   if (VectorTripCount)
2880     return VectorTripCount;
2881 
2882   Value *TC = getOrCreateTripCount(InsertBlock);
2883   IRBuilder<> Builder(InsertBlock->getTerminator());
2884 
2885   Type *Ty = TC->getType();
2886   // This is where we can make the step a runtime constant.
2887   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2888 
2889   // If the tail is to be folded by masking, round the number of iterations N
2890   // up to a multiple of Step instead of rounding down. This is done by first
2891   // adding Step-1 and then rounding down. Note that it's ok if this addition
2892   // overflows: the vector induction variable will eventually wrap to zero given
2893   // that it starts at zero and its Step is a power of two; the loop will then
2894   // exit, with the last early-exit vector comparison also producing all-true.
2895   if (Cost->foldTailByMasking()) {
2896     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2897            "VF*UF must be a power of 2 when folding tail by masking");
2898     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2899     TC = Builder.CreateAdd(
2900         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2901   }
2902 
2903   // Now we need to generate the expression for the part of the loop that the
2904   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2905   // iterations are not required for correctness, or N - Step, otherwise. Step
2906   // is equal to the vectorization factor (number of SIMD elements) times the
2907   // unroll factor (number of SIMD instructions).
2908   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2909 
2910   // There are cases where we *must* run at least one iteration in the remainder
2911   // loop.  See the cost model for when this can happen.  If the step evenly
2912   // divides the trip count, we set the remainder to be equal to the step. If
2913   // the step does not evenly divide the trip count, no adjustment is necessary
2914   // since there will already be scalar iterations. Note that the minimum
2915   // iterations check ensures that N >= Step.
2916   if (Cost->requiresScalarEpilogue(VF)) {
2917     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2918     R = Builder.CreateSelect(IsZero, Step, R);
2919   }
2920 
2921   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2922 
2923   return VectorTripCount;
2924 }
2925 
2926 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2927                                                    const DataLayout &DL) {
2928   // Verify that V is a vector type with same number of elements as DstVTy.
2929   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2930   unsigned VF = DstFVTy->getNumElements();
2931   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2932   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2933   Type *SrcElemTy = SrcVecTy->getElementType();
2934   Type *DstElemTy = DstFVTy->getElementType();
2935   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2936          "Vector elements must have same size");
2937 
2938   // Do a direct cast if element types are castable.
2939   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2940     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2941   }
2942   // V cannot be directly casted to desired vector type.
2943   // May happen when V is a floating point vector but DstVTy is a vector of
2944   // pointers or vice-versa. Handle this using a two-step bitcast using an
2945   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
2946   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2947          "Only one type should be a pointer type");
2948   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2949          "Only one type should be a floating point type");
2950   Type *IntTy =
2951       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2952   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2953   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2954   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2955 }
2956 
/// Emit a guard before the vector loop that branches to \p Bypass (the scalar
/// loop) when too few iterations remain for even one vector iteration, i.e.
/// when the vector trip count would be zero.
void InnerLoopVectorizer::emitMinimumIterationCountCheck(BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
                                            : ICmpInst::ICMP_ULT;

  // If tail is to be folded, vector loop takes care of all iterations, so the
  // bypass condition is statically false.
  Value *CheckMinIters = Builder.getFalse();
  if (!Cost->foldTailByMasking()) {
    Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
  }
  // Create new preheader for vector loop; the old preheader becomes the
  // check block.
  LoopVectorPreHeader =
      SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
                 "vector.ph");

  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                               DT->getNode(Bypass)->getIDom()) &&
         "TC check is expected to dominate Bypass");

  // Update dominator for Bypass & LoopExit (if needed).
  DT->changeImmediateDominator(Bypass, TCCheckBlock);
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks  and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

  // Replace the check block's terminator with the conditional bypass branch.
  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
  LoopBypassBlocks.push_back(TCCheckBlock);
}
3000 
3001 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3002 
3003   BasicBlock *const SCEVCheckBlock =
3004       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3005   if (!SCEVCheckBlock)
3006     return nullptr;
3007 
3008   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3009            (OptForSizeBasedOnProfile &&
3010             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3011          "Cannot SCEV check stride or overflow when optimizing for size");
3012 
3013 
3014   // Update dominator only if this is first RT check.
3015   if (LoopBypassBlocks.empty()) {
3016     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3017     if (!Cost->requiresScalarEpilogue(VF))
3018       // If there is an epilogue which must run, there's no edge from the
3019       // middle block to exit blocks  and thus no need to update the immediate
3020       // dominator of the exit blocks.
3021       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3022   }
3023 
3024   LoopBypassBlocks.push_back(SCEVCheckBlock);
3025   AddedSafetyChecks = true;
3026   return SCEVCheckBlock;
3027 }
3028 
3029 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3030   // VPlan-native path does not do any analysis for runtime checks currently.
3031   if (EnableVPlanNativePath)
3032     return nullptr;
3033 
3034   BasicBlock *const MemCheckBlock =
3035       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3036 
3037   // Check if we generated code that checks in runtime if arrays overlap. We put
3038   // the checks into a separate block to make the more common case of few
3039   // elements faster.
3040   if (!MemCheckBlock)
3041     return nullptr;
3042 
3043   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3044     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3045            "Cannot emit memory checks when optimizing for size, unless forced "
3046            "to vectorize.");
3047     ORE->emit([&]() {
3048       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3049                                         OrigLoop->getStartLoc(),
3050                                         OrigLoop->getHeader())
3051              << "Code-size may be reduced by not forcing "
3052                 "vectorization, or by source-code modifications "
3053                 "eliminating the need for runtime checks "
3054                 "(e.g., adding 'restrict').";
3055     });
3056   }
3057 
3058   LoopBypassBlocks.push_back(MemCheckBlock);
3059 
3060   AddedSafetyChecks = true;
3061 
3062   // We currently don't use LoopVersioning for the actual loop cloning but we
3063   // still use it to add the noalias metadata.
3064   LVer = std::make_unique<LoopVersioning>(
3065       *Legal->getLAI(),
3066       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3067       DT, PSE.getSE());
3068   LVer->prepareNoAliasMetadata();
3069   return MemCheckBlock;
3070 }
3071 
/// Carve the skeleton of the vectorized loop out of the original CFG by
/// splitting the preheader: preheader -> vector.body ... middle.block ->
/// scalar.ph, naming each new block with \p Prefix. Only blocks and branches
/// are created here; no vector instructions are emitted.
void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
         "multiple exit loop without required epilogue?");

  // Split twice: first to create the middle block, then to split the scalar
  // preheader off the middle block.
  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Set up the middle block terminator.  Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case).  In this case, set up a conditonal
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block.  completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
    BranchInst::Create(LoopScalarPreHeader) :
    BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
                       Builder.getTrue());
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // Split off the (still empty) vector body from the preheader.
  SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
             nullptr, nullptr, Twine(Prefix) + "vector.body");

  // Update dominator for loop exit.
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks  and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
}
3114 
/// Create, in the scalar preheader, one "bc.resume.val" phi per induction
/// variable, selecting the induction's computed end value when control
/// arrives from the vector loop's middle block and the original start value
/// when it arrives from a bypass block. \p AdditionalBypass optionally names
/// one extra (block, iteration-count) pair that needs its own end value.
void InnerLoopVectorizer::createInductionResumeValues(
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");

  Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
  assert(VectorTripCount && "Expected valid arguments");
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  Instruction *OldInduction = Legal->getPrimaryInduction();
  for (auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the  backedge-taken check block.
    PHINode *BCResumeVal =
        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
                        LoopScalarPreHeader->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
    if (OrigPhi == OldInduction) {
      // We know what the end value is: for the primary induction it is the
      // vector trip count itself.
      EndValue = VectorTripCount;
    } else {
      IRBuilder<> B(LoopVectorPreHeader->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      // Derive this induction's end value by transforming the vector trip
      // count (cast to the step's type) through the induction descriptor.
      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
      Value *Step =
          CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
      EndValue = emitTransformedIndex(B, CRD, II.getStartValue(), Step, II);
      EndValue->setName("ind.end");

      // Compute the end value for the additional bypass (if applicable),
      // using AdditionalBypass.second as the iteration count instead of the
      // vector trip count, inserted at the top of the bypass block.
      if (AdditionalBypass.first) {
        B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
        CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
                                         StepType, true);
        Value *Step =
            CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
        CRD =
            B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
        EndValueFromAdditionalBypass =
            emitTransformedIndex(B, CRD, II.getStartValue(), Step, II);
        EndValueFromAdditionalBypass->setName("ind.end");
      }
    }
    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);

    // Fix the scalar body counter (PHI node).
    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);

    if (AdditionalBypass.first)
      BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
                                            EndValueFromAdditionalBypass);

    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
  }
}
3193 
3194 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) {
3195   // The trip counts should be cached by now.
3196   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3197   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3198 
3199   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3200 
3201   // Add a check in the middle block to see if we have completed
3202   // all of the iterations in the first vector loop.  Three cases:
3203   // 1) If we require a scalar epilogue, there is no conditional branch as
3204   //    we unconditionally branch to the scalar preheader.  Do nothing.
3205   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3206   //    Thus if tail is to be folded, we know we don't need to run the
3207   //    remainder and we can use the previous value for the condition (true).
3208   // 3) Otherwise, construct a runtime check.
3209   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3210     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3211                                         Count, VectorTripCount, "cmp.n",
3212                                         LoopMiddleBlock->getTerminator());
3213 
3214     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3215     // of the corresponding compare because they may have ended up with
3216     // different line numbers and we want to avoid awkward line stepping while
3217     // debugging. Eg. if the compare has got a line number inside the loop.
3218     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3219     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3220   }
3221 
3222 #ifdef EXPENSIVE_CHECKS
3223   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3224 #endif
3225 
3226   return LoopVectorPreHeader;
3227 }
3228 
/// Build the complete vector-loop skeleton: blocks, trip-count guard, SCEV
/// and memory runtime checks, and induction resume phis. Returns the vector
/// preheader paired with a null resume value (this non-epilogue path has no
/// extra bypass value).
std::pair<BasicBlock *, Value *>
InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- loop iteration number check.
    /   |
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop.
  |     |
  |     v
  \   -[ ]   <--- middle-block.
   \/   |
   /\   v
   | ->[ ]     <--- new preheader.
   |    |
 (opt)  v      <-- edge from middle to exit iff epilogue is not required.
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
    \   |
     \  v
      >[ ]     <-- exit block(s).
   ...
   */

  // Get the metadata of the original loop before it gets modified.
  MDNode *OrigLoopID = OrigLoop->getLoopID();

  // Workaround!  Compute the trip count of the original loop and cache it
  // before we start modifying the CFG.  This code has a systemic problem
  // wherein it tries to run analysis over partially constructed IR; this is
  // wrong, and not simply for SCEV.  The trip count of the original loop
  // simply happens to be prone to hitting this in practice.  In theory, we
  // can hit the same issue for any SCEV, or ValueTracking query done during
  // mutation.  See PR49900.
  getOrCreateTripCount(OrigLoop->getLoopPreheader());

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  createVectorLoopSkeleton("");

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitMinimumIterationCountCheck(LoopScalarPreHeader);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(LoopScalarPreHeader);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(LoopScalarPreHeader);

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues();

  return {completeLoopSkeleton(OrigLoopID), nullptr};
}
3300 
// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
// \p EndValue supplies the value for users of the last iteration;
// \p CountRoundDown is used to recompute the penultimate value.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock,
                                       BasicBlock *VectorHeader) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value need to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      // Emit the recomputation in the middle block, right before its branch.
      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      // CRD - 1, cast to the step's type (SIToFP for FP inductions).
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");

      Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
                                    VectorHeader->getTerminator());
      Value *Escape =
          emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}
3373 
3374 namespace {
3375 
3376 struct CSEDenseMapInfo {
3377   static bool canHandle(const Instruction *I) {
3378     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3379            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3380   }
3381 
3382   static inline Instruction *getEmptyKey() {
3383     return DenseMapInfo<Instruction *>::getEmptyKey();
3384   }
3385 
3386   static inline Instruction *getTombstoneKey() {
3387     return DenseMapInfo<Instruction *>::getTombstoneKey();
3388   }
3389 
3390   static unsigned getHashValue(const Instruction *I) {
3391     assert(canHandle(I) && "Unknown instruction!");
3392     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3393                                                            I->value_op_end()));
3394   }
3395 
3396   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3397     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3398         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3399       return LHS == RHS;
3400     return LHS->isIdenticalTo(RHS);
3401   }
3402 };
3403 
3404 } // end anonymous namespace
3405 
3406 ///Perform cse of induction variable instructions.
3407 static void cse(BasicBlock *BB) {
3408   // Perform simple cse.
3409   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3410   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3411     if (!CSEDenseMapInfo::canHandle(&In))
3412       continue;
3413 
3414     // Check if we can replace this instruction with any of the
3415     // visited instructions.
3416     if (Instruction *V = CSEMap.lookup(&In)) {
3417       In.replaceAllUsesWith(V);
3418       In.eraseFromParent();
3419       continue;
3420     }
3421 
3422     CSEMap[&In] = &In;
3423   }
3424 }
3425 
3426 InstructionCost
3427 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3428                                               bool &NeedToScalarize) const {
3429   Function *F = CI->getCalledFunction();
3430   Type *ScalarRetTy = CI->getType();
3431   SmallVector<Type *, 4> Tys, ScalarTys;
3432   for (auto &ArgOp : CI->args())
3433     ScalarTys.push_back(ArgOp->getType());
3434 
3435   // Estimate cost of scalarized vector call. The source operands are assumed
3436   // to be vectors, so we need to extract individual elements from there,
3437   // execute VF scalar calls, and then gather the result into the vector return
3438   // value.
3439   InstructionCost ScalarCallCost =
3440       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3441   if (VF.isScalar())
3442     return ScalarCallCost;
3443 
3444   // Compute corresponding vector type for return value and arguments.
3445   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3446   for (Type *ScalarTy : ScalarTys)
3447     Tys.push_back(ToVectorTy(ScalarTy, VF));
3448 
3449   // Compute costs of unpacking argument values for the scalar calls and
3450   // packing the return values to a vector.
3451   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3452 
3453   InstructionCost Cost =
3454       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3455 
3456   // If we can't emit a vector call for this function, then the currently found
3457   // cost is the cost we need to return.
3458   NeedToScalarize = true;
3459   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3460   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3461 
3462   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3463     return Cost;
3464 
3465   // If the corresponding vector cost is cheaper, return its cost.
3466   InstructionCost VectorCallCost =
3467       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3468   if (VectorCallCost < Cost) {
3469     NeedToScalarize = false;
3470     Cost = VectorCallCost;
3471   }
3472   return Cost;
3473 }
3474 
3475 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3476   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3477     return Elt;
3478   return VectorType::get(Elt, VF);
3479 }
3480 
3481 InstructionCost
3482 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3483                                                    ElementCount VF) const {
3484   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3485   assert(ID && "Expected intrinsic call!");
3486   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3487   FastMathFlags FMF;
3488   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3489     FMF = FPMO->getFastMathFlags();
3490 
3491   SmallVector<const Value *> Arguments(CI->args());
3492   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3493   SmallVector<Type *> ParamTys;
3494   std::transform(FTy->param_begin(), FTy->param_end(),
3495                  std::back_inserter(ParamTys),
3496                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3497 
3498   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3499                                     dyn_cast<IntrinsicInst>(CI));
3500   return TTI.getIntrinsicInstrCost(CostAttrs,
3501                                    TargetTransformInfo::TCK_RecipThroughput);
3502 }
3503 
3504 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3505   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3506   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3507   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3508 }
3509 
3510 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3511   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3512   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3513   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3514 }
3515 
void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
  //
  // Instructions already replaced in an earlier unroll part; guards against
  // processing a value twice across parts.
  SmallPtrSet<Value *, 4> Erased;
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from State indicates that it
    // wasn't vectorized.
    // FIXME: Should not rely on getVPValue at this point.
    VPValue *Def = State.Plan->getVPValue(KV.first, true);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
        continue;
      // Target vector type with the minimal element bitwidth (KV.second).
      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy =
          IntegerType::get(OriginalTy->getContext(), KV.second);
      auto *TruncatedTy = VectorType::get(
          ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
      if (TruncatedTy == OriginalTy)
        continue;

      IRBuilder<> B(cast<Instruction>(I));
      // Narrow an operand to TruncatedTy, folding away a zext whose source
      // already has the truncated type.
      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));

        // Any wrapping introduced by shrinking this operation shouldn't be
        // considered undefined behavior. So, we can't unconditionally copy
        // arithmetic wrapping flags to NewI.
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        // Only the select's data operands are narrowed; the condition keeps
        // its (i1 vector) type.
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default:
          llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        }
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        // The two shuffle inputs may have different element counts; narrow
        // each at its own width, preserving the original mask.
        auto Elements0 =
            cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 =
            cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
        auto *O1 = B.CreateZExtOrTrunc(
            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements =
            cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements =
            cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        // If we don't know what to do, be conservative and don't do anything.
        continue;
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      // Re-point the VPlan state at the extended replacement value.
      State.reset(Def, Res, Part);
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from State indicates that it
    // wasn't vectorized.
    // FIXME: Should not rely on getVPValue at this point.
    VPValue *Def = State.Plan->getVPValue(KV.first, true);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      // If the mapped value is an unused zext, strip it and map the VPValue
      // directly to the zext's (narrow) operand.
      Value *I = State.get(Def, Part);
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        State.reset(Def, NewI, Part);
      }
    }
  }
}
3648 
// Run the post-widening fixup pipeline: minimal-bitwidth truncation,
// non-induction PHI fixup (VPlan-native path), cross-iteration PHI
// completion, external-use patching, predicated-instruction sinking, CSE,
// and profile-weight updates. The steps are order-sensitive.
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF.isVector())
    truncateToMinimalBitwidths(State);

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs(State);
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs(State);

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  Loop *VectorLoop = LI->getLoopFor(State.CFG.PrevBB);
  // If we inserted an edge from the middle block to the unique exit block,
  // update uses outside the loop (phis) to account for the newly inserted
  // edge.
  if (!Cost->requiresScalarEpilogue(VF)) {
    // Fix-up external users of the induction variables.
    for (auto &Entry : Legal->getInductionVars())
      fixupIVUsers(Entry.first, Entry.second,
                   getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
                   IVEndValues[Entry.first], LoopMiddleBlock,
                   VectorLoop->getHeader());

    fixLCSSAPHIs(State);
  }

  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(VectorLoop->getHeader());

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that original loop
  // represented by LoopScalarBody becomes remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the weight
  // to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
                               LI->getLoopFor(LoopScalarBody),
                               VF.getKnownMinValue() * UF);
}
3709 
3710 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3711   // In order to support recurrences we need to be able to vectorize Phi nodes.
3712   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3713   // stage #2: We now need to fix the recurrences by adding incoming edges to
3714   // the currently empty PHI nodes. At this point every instruction in the
3715   // original loop is widened to a vector form so we can use them to construct
3716   // the incoming edges.
3717   VPBasicBlock *Header =
3718       State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3719   for (VPRecipeBase &R : Header->phis()) {
3720     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3721       fixReduction(ReductionPhi, State);
3722     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3723       fixFirstOrderRecurrence(FOR, State);
3724   }
3725 }
3726 
void InnerLoopVectorizer::fixFirstOrderRecurrence(
    VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // vector phi v1 for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After execution completes the vector loop, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  VPValue *PreviousDef = PhiR->getBackedgeValue();
  Value *Incoming = State.get(PreviousDef, UF - 1);
  auto *ExtractForScalar = Incoming;
  auto *IdxTy = Builder.getInt32Ty();
  if (VF.isVector()) {
    // Index VF-1 is computed at runtime to also handle scalable vectors.
    auto *One = ConstantInt::get(IdxTy, 1);
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
                                                    "vector.recur.extract");
  }
  // Extract the second last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF.isVector()) {
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Idx, "vector.recur.extract.for.phi");
  } else if (UF > 1)
    // When loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
    // of `Incoming`. This is analogous to the vectorized case above: extracting
    // the second last element when VF > 1.
    ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
  // From the middle block use the extracted last lane; from every other
  // predecessor use the original scalar start value.
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from middle to exit
  // and thus no phis which needed updating.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
        LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
}
3837 
3838 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3839                                        VPTransformState &State) {
3840   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
3841   // Get it's reduction variable descriptor.
3842   assert(Legal->isReductionVariable(OrigPhi) &&
3843          "Unable to find the reduction variable");
3844   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3845 
3846   RecurKind RK = RdxDesc.getRecurrenceKind();
3847   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3848   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3849   setDebugLocFromInst(ReductionStartValue);
3850 
3851   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3852   // This is the vector-clone of the value that leaves the loop.
3853   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3854 
3855   // Wrap flags are in general invalid after vectorization, clear them.
3856   clearReductionWrapFlags(RdxDesc, State);
3857 
3858   // Before each round, move the insertion point right between
3859   // the PHIs and the values we are going to write.
3860   // This allows us to write both PHINodes and the extractelement
3861   // instructions.
3862   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3863 
3864   setDebugLocFromInst(LoopExitInst);
3865 
3866   Type *PhiTy = OrigPhi->getType();
3867   BasicBlock *VectorLoopLatch =
3868       LI->getLoopFor(State.CFG.PrevBB)->getLoopLatch();
3869   // If tail is folded by masking, the vector value to leave the loop should be
3870   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3871   // instead of the former. For an inloop reduction the reduction will already
3872   // be predicated, and does not need to be handled here.
3873   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3874     for (unsigned Part = 0; Part < UF; ++Part) {
3875       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3876       Value *Sel = nullptr;
3877       for (User *U : VecLoopExitInst->users()) {
3878         if (isa<SelectInst>(U)) {
3879           assert(!Sel && "Reduction exit feeding two selects");
3880           Sel = U;
3881         } else
3882           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3883       }
3884       assert(Sel && "Reduction exit feeds no select");
3885       State.reset(LoopExitInstDef, Sel, Part);
3886 
3887       // If the target can create a predicated operator for the reduction at no
3888       // extra cost in the loop (for example a predicated vadd), it can be
3889       // cheaper for the select to remain in the loop than be sunk out of it,
3890       // and so use the select value for the phi instead of the old
3891       // LoopExitValue.
3892       if (PreferPredicatedReductionSelect ||
3893           TTI->preferPredicatedReductionSelect(
3894               RdxDesc.getOpcode(), PhiTy,
3895               TargetTransformInfo::ReductionFlags())) {
3896         auto *VecRdxPhi =
3897             cast<PHINode>(State.get(PhiR, Part));
3898         VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3899       }
3900     }
3901   }
3902 
3903   // If the vector reduction can be performed in a smaller type, we truncate
3904   // then extend the loop exit value to enable InstCombine to evaluate the
3905   // entire expression in the smaller type.
3906   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3907     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3908     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3909     Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3910     VectorParts RdxParts(UF);
3911     for (unsigned Part = 0; Part < UF; ++Part) {
3912       RdxParts[Part] = State.get(LoopExitInstDef, Part);
3913       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3914       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3915                                         : Builder.CreateZExt(Trunc, VecTy);
3916       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3917         if (U != Trunc) {
3918           U->replaceUsesOfWith(RdxParts[Part], Extnd);
3919           RdxParts[Part] = Extnd;
3920         }
3921     }
3922     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3923     for (unsigned Part = 0; Part < UF; ++Part) {
3924       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3925       State.reset(LoopExitInstDef, RdxParts[Part], Part);
3926     }
3927   }
3928 
3929   // Reduce all of the unrolled parts into a single vector.
3930   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3931   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3932 
3933   // The middle block terminator has already been assigned a DebugLoc here (the
3934   // OrigLoop's single latch terminator). We want the whole middle block to
3935   // appear to execute on this line because: (a) it is all compiler generated,
3936   // (b) these instructions are always executed after evaluating the latch
3937   // conditional branch, and (c) other passes may add new predecessors which
3938   // terminate on this line. This is the easiest way to ensure we don't
3939   // accidentally cause an extra step back into the loop while debugging.
3940   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3941   if (PhiR->isOrdered())
3942     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3943   else {
3944     // Floating-point operations should have some FMF to enable the reduction.
3945     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3946     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3947     for (unsigned Part = 1; Part < UF; ++Part) {
3948       Value *RdxPart = State.get(LoopExitInstDef, Part);
3949       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3950         ReducedPartRdx = Builder.CreateBinOp(
3951             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3952       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3953         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3954                                            ReducedPartRdx, RdxPart);
3955       else
3956         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3957     }
3958   }
3959 
3960   // Create the reduction after the loop. Note that inloop reductions create the
3961   // target reduction in the loop using a Reduction recipe.
3962   if (VF.isVector() && !PhiR->isInLoop()) {
3963     ReducedPartRdx =
3964         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3965     // If the reduction can be performed in a smaller type, we need to extend
3966     // the reduction to the wider type before we branch to the original loop.
3967     if (PhiTy != RdxDesc.getRecurrenceType())
3968       ReducedPartRdx = RdxDesc.isSigned()
3969                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3970                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3971   }
3972 
3973   PHINode *ResumePhi =
3974       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
3975 
3976   // Create a phi node that merges control-flow from the backedge-taken check
3977   // block and the middle block.
3978   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
3979                                         LoopScalarPreHeader->getTerminator());
3980 
3981   // If we are fixing reductions in the epilogue loop then we should already
3982   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
3983   // we carry over the incoming values correctly.
3984   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
3985     if (Incoming == LoopMiddleBlock)
3986       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
3987     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
3988       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
3989                               Incoming);
3990     else
3991       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
3992   }
3993 
3994   // Set the resume value for this reduction
3995   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
3996 
3997   // Now, we need to fix the users of the reduction variable
3998   // inside and outside of the scalar remainder loop.
3999 
4000   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4001   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4003   if (!Cost->requiresScalarEpilogue(VF))
4004     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4005       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4006         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4007 
4008   // Fix the scalar loop reduction variable with the incoming reduction sum
4009   // from the vector body and from the backedge value.
4010   int IncomingEdgeBlockIdx =
4011       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4012   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4013   // Pick the other block.
4014   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4015   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4016   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4017 }
4018 
// Drop poison-generating wrap flags (nuw/nsw) from the widened instructions
// that form an integer add/mul reduction chain. The walk starts at the
// reduction's loop-exit instruction and follows users transitively; for each
// overflowing binary operator found, the flags are cleared on every unrolled
// part of its vectorized counterpart.
void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
                                                  VPTransformState &State) {
  // Only integer add/mul recurrences can carry nuw/nsw flags worth clearing.
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RK != RecurKind::Add && RK != RecurKind::Mul)
    return;

  Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
  assert(LoopExitInstr && "null loop exit instruction");
  // Standard worklist traversal; Visited guards against revisiting nodes in
  // the (possibly cyclic, because of the reduction phi) def-use graph.
  SmallVector<Instruction *, 8> Worklist;
  SmallPtrSet<Instruction *, 8> Visited;
  Worklist.push_back(LoopExitInstr);
  Visited.insert(LoopExitInstr);

  while (!Worklist.empty()) {
    Instruction *Cur = Worklist.pop_back_val();
    // Only overflowing binary operators (add/sub/mul/shl) can carry nuw/nsw;
    // clear the flags on each unrolled copy of the widened instruction.
    if (isa<OverflowingBinaryOperator>(Cur))
      for (unsigned Part = 0; Part < UF; ++Part) {
        // FIXME: Should not rely on getVPValue at this point.
        Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
        cast<Instruction>(V)->dropPoisonGeneratingFlags();
      }

    // Follow users transitively. Users of the loop-exit instruction itself
    // are only followed if they remain inside the original loop.
    for (User *U : Cur->users()) {
      Instruction *UI = cast<Instruction>(U);
      if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
          Visited.insert(UI).second)
        Worklist.push_back(UI);
    }
  }
}
4049 
// Complete the LCSSA phis in the loop exit block: any phi not already wired
// up by the reduction/recurrence fixup code gets an incoming value from the
// middle block, extracted from the vectorized loop's state.
void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
  for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
    if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
      // Some phis were already hand updated by the reduction and recurrence
      // code above, leave them alone.
      continue;

    auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
    // Non-instruction incoming values will have only one value.

    // Uniform-after-vectorization values live in lane 0; otherwise the value
    // reaching the exit is produced by the last lane of the last part.
    VPLane Lane = VPLane::getFirstLane();
    if (isa<Instruction>(IncomingValue) &&
        !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
                                           VF))
      Lane = VPLane::getLastLaneForVF(VF);

    // Can be a loop invariant incoming value or the last scalar value to be
    // extracted from the vectorized loop.
    // FIXME: Should not rely on getVPValue at this point.
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    Value *lastIncomingValue =
        OrigLoop->isLoopInvariant(IncomingValue)
            ? IncomingValue
            : State.get(State.Plan->getVPValue(IncomingValue, true),
                        VPIteration(UF - 1, Lane));
    LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
  }
}
4078 
// Sink scalarized operands of the predicated instruction \p PredInst into its
// predicated block, so they only execute when the predicate holds. Runs to a
// fixed point: sinking one instruction may legalize sinking its operands.
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is not in the loop,
      // or may have side effects.
      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
          I->mayHaveSideEffects())
        continue;

      // If the instruction is already in PredBB, check if we can sink its
      // operands. In that case, VPlan's sinkScalarOperands() succeeded in
      // sinking the scalar instruction I, hence it appears in PredBB; but it
      // may have failed to sink I's operands (recursively), which we try
      // (again) here.
      if (I->getParent() == PredBB) {
        Worklist.insert(I->op_begin(), I->op_end());
        continue;
      }

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}
4152 
// Fill in the incoming values of the widened non-induction phis created by
// widenPHIInstruction(): the vector phis were created empty and are completed
// here, once all widened values and IR basic blocks are available.
void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
  for (PHINode *OrigPhi : OrigPHIsToFix) {
    VPWidenPHIRecipe *VPPhi =
        cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
    PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
    // Make sure the builder has a valid insert point.
    Builder.SetInsertPoint(NewPhi);
    // Wire each VPlan incoming (value, block) pair to the corresponding
    // generated IR value and IR basic block.
    for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
      VPValue *Inc = VPPhi->getIncomingValue(i);
      VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
      NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
    }
  }
}
4167 
// Thin wrapper: forward the ordered-reduction query for \p RdxDesc to the
// cost model, which owns the decision.
bool InnerLoopVectorizer::useOrderedReductions(
    const RecurrenceDescriptor &RdxDesc) {
  return Cost->useOrderedReductions(RdxDesc);
}
4172 
4173 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4174                                               VPWidenPHIRecipe *PhiR,
4175                                               VPTransformState &State) {
4176   assert(EnableVPlanNativePath &&
4177          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
4178   // Currently we enter here in the VPlan-native path for non-induction
4179   // PHIs where all control flow is uniform. We simply widen these PHIs.
4180   // Create a vector phi with no operands - the vector phi operands will be
4181   // set at the end of vector code generation.
4182   Type *VecTy = (State.VF.isScalar())
4183                     ? PN->getType()
4184                     : VectorType::get(PN->getType(), State.VF);
4185   Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4186   State.set(PhiR, VecPhi, 0);
4187   OrigPHIsToFix.push_back(cast<PHINode>(PN));
4188 }
4189 
4190 /// A helper function for checking whether an integer division-related
4191 /// instruction may divide by zero (in which case it must be predicated if
4192 /// executed conditionally in the scalar code).
4193 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4194 /// Non-zero divisors that are non compile-time constants will not be
4195 /// converted into multiplication, so we will still end up scalarizing
4196 /// the division, but can do so w/o predication.
4197 static bool mayDivideByZero(Instruction &I) {
4198   assert((I.getOpcode() == Instruction::UDiv ||
4199           I.getOpcode() == Instruction::SDiv ||
4200           I.getOpcode() == Instruction::URem ||
4201           I.getOpcode() == Instruction::SRem) &&
4202          "Unexpected instruction");
4203   Value *Divisor = I.getOperand(1);
4204   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4205   return !CInt || CInt->isZero();
4206 }
4207 
4208 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4209                                                VPUser &ArgOperands,
4210                                                VPTransformState &State) {
4211   assert(!isa<DbgInfoIntrinsic>(I) &&
4212          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4213   setDebugLocFromInst(&I);
4214 
4215   Module *M = I.getParent()->getParent()->getParent();
4216   auto *CI = cast<CallInst>(&I);
4217 
4218   SmallVector<Type *, 4> Tys;
4219   for (Value *ArgOperand : CI->args())
4220     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4221 
4222   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4223 
4224   // The flag shows whether we use Intrinsic or a usual Call for vectorized
4225   // version of the instruction.
4226   // Is it beneficial to perform intrinsic call compared to lib call?
4227   bool NeedToScalarize = false;
4228   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4229   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4230   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4231   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4232          "Instruction should be scalarized elsewhere.");
4233   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4234          "Either the intrinsic cost or vector call cost must be valid");
4235 
4236   for (unsigned Part = 0; Part < UF; ++Part) {
4237     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4238     SmallVector<Value *, 4> Args;
4239     for (auto &I : enumerate(ArgOperands.operands())) {
4240       // Some intrinsics have a scalar argument - don't replace it with a
4241       // vector.
4242       Value *Arg;
4243       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4244         Arg = State.get(I.value(), Part);
4245       else {
4246         Arg = State.get(I.value(), VPIteration(0, 0));
4247         if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
4248           TysForDecl.push_back(Arg->getType());
4249       }
4250       Args.push_back(Arg);
4251     }
4252 
4253     Function *VectorF;
4254     if (UseVectorIntrinsic) {
4255       // Use vector version of the intrinsic.
4256       if (VF.isVector())
4257         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4258       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4259       assert(VectorF && "Can't retrieve vector intrinsic.");
4260     } else {
4261       // Use vector version of the function call.
4262       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4263 #ifndef NDEBUG
4264       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4265              "Can't create vector function.");
4266 #endif
4267         VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4268     }
4269       SmallVector<OperandBundleDef, 1> OpBundles;
4270       CI->getOperandBundlesAsDefs(OpBundles);
4271       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4272 
4273       if (isa<FPMathOperator>(V))
4274         V->copyFastMathFlags(CI);
4275 
4276       State.set(Def, V, Part);
4277       addMetadata(V, &I);
4278   }
4279 }
4280 
// Compute, for the given VF, the set of instructions that will remain scalar
// after vectorization, seeding from uniform instructions, scalar-used memory
// pointers and forced scalars, then growing the set through pointer chains
// and induction variables. The result is cached in Scalars[VF].
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
         "This function should not be visited twice for the same VF");

  // This avoids any chances of creating a REPLICATE recipe during planning
  // since that would result in generation of scalarized code during execution,
  // which is not supported for scalable vectors.
  if (VF.isScalable()) {
    Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
    return;
  }

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a bitcast or
  // getelementptr instruction contained in the loop.
  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
            isa<GetElementPtrInst>(V)) &&
           !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use will
  // be a scalar use and the pointer is only used by memory accesses, we place
  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about bitcast and getelementptr instructions contained in
    // the loop.
    if (!isLoopVaryingBitCastOrGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Ptr);
    if (Worklist.count(I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
          return isa<LoadInst>(U) || isa<StoreInst>(U);
        }))
      ScalarPtrs.insert(I);
    else
      PossibleNonScalarPtrs.insert(I);
  };

  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast and
  // getelementptr instructions used by memory accesses requiring a scalar
  // use, and (3) instructions the cost model has forced to remain scalar.
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());

  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(&I)) {
        evaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
        evaluatePtrUse(Store, Store->getPointerOperand());
        evaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(I);
    }

  // (3) Insert the forced scalars.
  // FIXME: Currently widenPHIInstruction() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second)
      Worklist.insert(I);

  // Expand the worklist by looking through any bitcasts and getelementptr
  // instructions we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional bitcasts and getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
      continue;
    auto *Src = cast<Instruction>(Dst->getOperand(0));
    if (llvm::all_of(Src->users(), [&](User *U) -> bool {
          auto *J = cast<Instruction>(U);
          return !TheLoop->contains(J) || Worklist.count(J) ||
                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
                  isScalarUse(J, Src));
        })) {
      Worklist.insert(Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Returns true if \p Indvar is a pointer induction that is used directly by
    // load/store instruction \p I.
    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
                                              Instruction *I) {
      return Induction.second.getKind() ==
                 InductionDescriptor::IK_PtrInduction &&
             (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
    };

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);
    });
    if (!ScalarInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    auto ScalarIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
        });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(Ind);
    Worklist.insert(IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert(Worklist.begin(), Worklist.end());
}
4465 
4466 bool LoopVectorizationCostModel::isScalarWithPredication(
4467     Instruction *I, ElementCount VF) const {
4468   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4469     return false;
4470   switch(I->getOpcode()) {
4471   default:
4472     break;
4473   case Instruction::Load:
4474   case Instruction::Store: {
4475     if (!Legal->isMaskRequired(I))
4476       return false;
4477     auto *Ptr = getLoadStorePointerOperand(I);
4478     auto *Ty = getLoadStoreType(I);
4479     Type *VTy = Ty;
4480     if (VF.isVector())
4481       VTy = VectorType::get(Ty, VF);
4482     const Align Alignment = getLoadStoreAlignment(I);
4483     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4484                                 TTI.isLegalMaskedGather(VTy, Alignment))
4485                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4486                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4487   }
4488   case Instruction::UDiv:
4489   case Instruction::SDiv:
4490   case Instruction::SRem:
4491   case Instruction::URem:
4492     return mayDivideByZero(*I);
4493   }
4494   return false;
4495 }
4496 
// Decide whether the interleaved access \p I can be widened for \p VF:
// the member type must have a regular layout, and if masking is needed
// (predication or gaps) the target must support the masked form.
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
    Instruction *I, ElementCount VF) {
  assert(isAccessInterleaved(I) && "Expecting interleaved access.");
  assert(getWideningDecision(I, VF) == CM_Unknown &&
         "Decision should not be set yet.");
  auto *Group = getInterleavedAccessGroup(I);
  assert(Group && "Must have a group.");

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  auto *ScalarTy = getLoadStoreType(I);
  if (hasIrregularType(ScalarTy, DL))
    return false;

  // Check if masking is required.
  // A Group may need masking for one of two reasons: it resides in a block that
  // needs predication, or it was decided to use masking to deal with gaps
  // (either a gap at the end of a load-access that may result in a speculative
  // load, or any gaps in a store-access).
  bool PredicatedAccessRequiresMasking =
      blockNeedsPredicationForAnyReason(I->getParent()) &&
      Legal->isMaskRequired(I);
  bool LoadAccessWithGapsRequiresEpilogMasking =
      isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
      !isScalarEpilogueAllowed();
  bool StoreAccessWithGapsRequiresMasking =
      isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
  // No masking needed from any of the three sources: widening is fine.
  if (!PredicatedAccessRequiresMasking &&
      !LoadAccessWithGapsRequiresEpilogMasking &&
      !StoreAccessWithGapsRequiresMasking)
    return true;

  // If masked interleaving is required, we expect that the user/target had
  // enabled it, because otherwise it either wouldn't have been created or
  // it should have been invalidated by the CostModel.
  assert(useMaskedInterleavedAccesses(TTI) &&
         "Masked interleave-groups for predicated accesses are not enabled.");

  // Reversed masked interleave-groups are not supported.
  if (Group->isReverse())
    return false;

  // Finally, the target must support the masked load/store of this type.
  auto *Ty = getLoadStoreType(I);
  const Align Alignment = getLoadStoreAlignment(I);
  return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
                          : TTI.isLegalMaskedStore(Ty, Alignment);
}
4544 
// Decide whether the load/store \p I can be turned into a single wide memory
// operation for \p VF: the pointer must be consecutive, the access must not
// require predication-driven scalarization, and the type must be regular.
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
    Instruction *I, ElementCount VF) {
  // Get and ensure we have a valid memory instruction.
  assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");

  auto *Ptr = getLoadStorePointerOperand(I);
  auto *ScalarTy = getLoadStoreType(I);

  // In order to be widened, the pointer should be consecutive, first of all.
  if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
    return false;

  // If the instruction is a store located in a predicated block, it will be
  // scalarized.
  if (isScalarWithPredication(I, VF))
    return false;

  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
  auto &DL = I->getModule()->getDataLayout();
  if (hasIrregularType(ScalarTy, DL))
    return false;

  return true;
}
4570 
4571 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4572   // We should not collect Uniforms more than once per VF. Right now,
4573   // this function is called from collectUniformsAndScalars(), which
4574   // already does this check. Collecting Uniforms for VF=1 does not make any
4575   // sense.
4576 
4577   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4578          "This function should not be visited twice for the same VF");
4579 
4580   // Visit the list of Uniforms. If we'll not find any uniform value, we'll
4581   // not analyze again.  Uniforms.count(VF) will return 1.
4582   Uniforms[VF].clear();
4583 
4584   // We now know that the loop is vectorizable!
4585   // Collect instructions inside the loop that will remain uniform after
4586   // vectorization.
4587 
4588   // Global values, params and instructions outside of current loop are out of
4589   // scope.
4590   auto isOutOfScope = [&](Value *V) -> bool {
4591     Instruction *I = dyn_cast<Instruction>(V);
4592     return (!I || !TheLoop->contains(I));
4593   };
4594 
4595   // Worklist containing uniform instructions demanding lane 0.
4596   SetVector<Instruction *> Worklist;
4597   BasicBlock *Latch = TheLoop->getLoopLatch();
4598 
4599   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4600   // that are scalar with predication must not be considered uniform after
4601   // vectorization, because that would create an erroneous replicating region
4602   // where only a single instance out of VF should be formed.
4603   // TODO: optimize such seldom cases if found important, see PR40816.
4604   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4605     if (isOutOfScope(I)) {
4606       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4607                         << *I << "\n");
4608       return;
4609     }
4610     if (isScalarWithPredication(I, VF)) {
4611       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4612                         << *I << "\n");
4613       return;
4614     }
4615     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4616     Worklist.insert(I);
4617   };
4618 
4619   // Start with the conditional branch. If the branch condition is an
4620   // instruction contained in the loop that is only used by the branch, it is
4621   // uniform.
4622   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4623   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4624     addToWorklistIfAllowed(Cmp);
4625 
4626   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4627     InstWidening WideningDecision = getWideningDecision(I, VF);
4628     assert(WideningDecision != CM_Unknown &&
4629            "Widening decision should be ready at this moment");
4630 
4631     // A uniform memory op is itself uniform.  We exclude uniform stores
4632     // here as they demand the last lane, not the first one.
4633     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
4634       assert(WideningDecision == CM_Scalarize);
4635       return true;
4636     }
4637 
4638     return (WideningDecision == CM_Widen ||
4639             WideningDecision == CM_Widen_Reverse ||
4640             WideningDecision == CM_Interleave);
4641   };
4642 
4643 
4644   // Returns true if Ptr is the pointer operand of a memory access instruction
4645   // I, and I is known to not require scalarization.
4646   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4647     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4648   };
4649 
4650   // Holds a list of values which are known to have at least one uniform use.
4651   // Note that there may be other uses which aren't uniform.  A "uniform use"
4652   // here is something which only demands lane 0 of the unrolled iterations;
4653   // it does not imply that all lanes produce the same value (e.g. this is not
4654   // the usual meaning of uniform)
4655   SetVector<Value *> HasUniformUse;
4656 
4657   // Scan the loop for instructions which are either a) known to have only
4658   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4659   for (auto *BB : TheLoop->blocks())
4660     for (auto &I : *BB) {
4661       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4662         switch (II->getIntrinsicID()) {
4663         case Intrinsic::sideeffect:
4664         case Intrinsic::experimental_noalias_scope_decl:
4665         case Intrinsic::assume:
4666         case Intrinsic::lifetime_start:
4667         case Intrinsic::lifetime_end:
4668           if (TheLoop->hasLoopInvariantOperands(&I))
4669             addToWorklistIfAllowed(&I);
4670           break;
4671         default:
4672           break;
4673         }
4674       }
4675 
4676       // ExtractValue instructions must be uniform, because the operands are
4677       // known to be loop-invariant.
4678       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4679         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4680                "Expected aggregate value to be loop invariant");
4681         addToWorklistIfAllowed(EVI);
4682         continue;
4683       }
4684 
4685       // If there's no pointer operand, there's nothing to do.
4686       auto *Ptr = getLoadStorePointerOperand(&I);
4687       if (!Ptr)
4688         continue;
4689 
4690       // A uniform memory op is itself uniform.  We exclude uniform stores
4691       // here as they demand the last lane, not the first one.
4692       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
4693         addToWorklistIfAllowed(&I);
4694 
4695       if (isUniformDecision(&I, VF)) {
4696         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4697         HasUniformUse.insert(Ptr);
4698       }
4699     }
4700 
4701   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4702   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4703   // disallows uses outside the loop as well.
4704   for (auto *V : HasUniformUse) {
4705     if (isOutOfScope(V))
4706       continue;
4707     auto *I = cast<Instruction>(V);
4708     auto UsersAreMemAccesses =
4709       llvm::all_of(I->users(), [&](User *U) -> bool {
4710         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4711       });
4712     if (UsersAreMemAccesses)
4713       addToWorklistIfAllowed(I);
4714   }
4715 
4716   // Expand Worklist in topological order: whenever a new instruction
4717   // is added , its users should be already inside Worklist.  It ensures
4718   // a uniform instruction will only be used by uniform instructions.
4719   unsigned idx = 0;
4720   while (idx != Worklist.size()) {
4721     Instruction *I = Worklist[idx++];
4722 
4723     for (auto OV : I->operand_values()) {
4724       // isOutOfScope operands cannot be uniform instructions.
4725       if (isOutOfScope(OV))
4726         continue;
4727       // First order recurrence Phi's should typically be considered
4728       // non-uniform.
4729       auto *OP = dyn_cast<PHINode>(OV);
4730       if (OP && Legal->isFirstOrderRecurrence(OP))
4731         continue;
4732       // If all the users of the operand are uniform, then add the
4733       // operand into the uniform worklist.
4734       auto *OI = cast<Instruction>(OV);
4735       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4736             auto *J = cast<Instruction>(U);
4737             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4738           }))
4739         addToWorklistIfAllowed(OI);
4740     }
4741   }
4742 
4743   // For an instruction to be added into Worklist above, all its users inside
4744   // the loop should also be in Worklist. However, this condition cannot be
4745   // true for phi nodes that form a cyclic dependence. We must process phi
4746   // nodes separately. An induction variable will remain uniform if all users
4747   // of the induction variable and induction variable update remain uniform.
4748   // The code below handles both pointer and non-pointer induction variables.
4749   for (auto &Induction : Legal->getInductionVars()) {
4750     auto *Ind = Induction.first;
4751     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4752 
4753     // Determine if all users of the induction variable are uniform after
4754     // vectorization.
4755     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4756       auto *I = cast<Instruction>(U);
4757       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4758              isVectorizedMemAccessUse(I, Ind);
4759     });
4760     if (!UniformInd)
4761       continue;
4762 
4763     // Determine if all users of the induction variable update instruction are
4764     // uniform after vectorization.
4765     auto UniformIndUpdate =
4766         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4767           auto *I = cast<Instruction>(U);
4768           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4769                  isVectorizedMemAccessUse(I, IndUpdate);
4770         });
4771     if (!UniformIndUpdate)
4772       continue;
4773 
4774     // The induction variable and its update instruction will remain uniform.
4775     addToWorklistIfAllowed(Ind);
4776     addToWorklistIfAllowed(IndUpdate);
4777   }
4778 
4779   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4780 }
4781 
4782 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4783   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4784 
4785   if (Legal->getRuntimePointerChecking()->Need) {
4786     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4787         "runtime pointer checks needed. Enable vectorization of this "
4788         "loop with '#pragma clang loop vectorize(enable)' when "
4789         "compiling with -Os/-Oz",
4790         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4791     return true;
4792   }
4793 
4794   if (!PSE.getPredicate().isAlwaysTrue()) {
4795     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4796         "runtime SCEV checks needed. Enable vectorization of this "
4797         "loop with '#pragma clang loop vectorize(enable)' when "
4798         "compiling with -Os/-Oz",
4799         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4800     return true;
4801   }
4802 
4803   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4804   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4805     reportVectorizationFailure("Runtime stride check for small trip count",
4806         "runtime stride == 1 checks needed. Enable vectorization of "
4807         "this loop without such check by compiling with -Os/-Oz",
4808         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4809     return true;
4810   }
4811 
4812   return false;
4813 }
4814 
4815 ElementCount
4816 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4817   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4818     return ElementCount::getScalable(0);
4819 
4820   if (Hints->isScalableVectorizationDisabled()) {
4821     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4822                             "ScalableVectorizationDisabled", ORE, TheLoop);
4823     return ElementCount::getScalable(0);
4824   }
4825 
4826   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4827 
4828   auto MaxScalableVF = ElementCount::getScalable(
4829       std::numeric_limits<ElementCount::ScalarTy>::max());
4830 
4831   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4832   // FIXME: While for scalable vectors this is currently sufficient, this should
4833   // be replaced by a more detailed mechanism that filters out specific VFs,
4834   // instead of invalidating vectorization for a whole set of VFs based on the
4835   // MaxVF.
4836 
4837   // Disable scalable vectorization if the loop contains unsupported reductions.
4838   if (!canVectorizeReductions(MaxScalableVF)) {
4839     reportVectorizationInfo(
4840         "Scalable vectorization not supported for the reduction "
4841         "operations found in this loop.",
4842         "ScalableVFUnfeasible", ORE, TheLoop);
4843     return ElementCount::getScalable(0);
4844   }
4845 
4846   // Disable scalable vectorization if the loop contains any instructions
4847   // with element types not supported for scalable vectors.
4848   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4849         return !Ty->isVoidTy() &&
4850                !this->TTI.isElementTypeLegalForScalableVector(Ty);
4851       })) {
4852     reportVectorizationInfo("Scalable vectorization is not supported "
4853                             "for all element types found in this loop.",
4854                             "ScalableVFUnfeasible", ORE, TheLoop);
4855     return ElementCount::getScalable(0);
4856   }
4857 
4858   if (Legal->isSafeForAnyVectorWidth())
4859     return MaxScalableVF;
4860 
4861   // Limit MaxScalableVF by the maximum safe dependence distance.
4862   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
4863   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4864     MaxVScale =
4865         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4866   MaxScalableVF = ElementCount::getScalable(
4867       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
4868   if (!MaxScalableVF)
4869     reportVectorizationInfo(
4870         "Max legal vector width too small, scalable vectorization "
4871         "unfeasible.",
4872         "ScalableVFUnfeasible", ORE, TheLoop);
4873 
4874   return MaxScalableVF;
4875 }
4876 
// Determine the widest fixed-width and scalable VFs that are both safe with
// respect to loop-carried memory dependences and feasible for the target,
// honouring a user-specified VF hint (UserVF) when it is safe to do so.
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
    unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElements =
      PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);

  auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    // Compare the hint against the safe bound of matching scalability.
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
      else
        return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    // An unsafe scalable hint cannot be clamped meaningfully; report why it
    // is ignored and fall through to the normal VF selection below.
    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  // Compute the best achievable fixed and scalable VFs for the target,
  // starting from the safe upper bounds established above. The default of
  // fixed VF 1 / scalable VF 0 means "scalar only" if neither succeeds.
  FixedScalableVFPair Result(ElementCount::getFixed(1),
                             ElementCount::getScalable(0));
  if (auto MaxVF =
          getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
                                  MaxSafeFixedVF, FoldTailByMasking))
    Result.FixedVF = MaxVF;

  if (auto MaxVF =
          getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
                                  MaxSafeScalableVF, FoldTailByMasking))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}
4981 
// Compute the maximum vectorization factors (fixed and scalable) for this
// loop, deciding along the way whether a scalar epilogue is permitted or the
// tail must be folded by masking. Returns FixedScalableVFPair::getNone() when
// vectorization is not possible under the current constraints.
FixedScalableVFPair
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
  if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do since it's still likely to be dynamically
    // uniform if the target can skip.
    reportVectorizationFailure(
        "Not inserting runtime ptr check for divergent target",
        "runtime pointer checks needed. Not enabled for divergent target",
        "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  // TC == 0 means the trip count is not known at compile time.
  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
  LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
  if (TC == 1) {
    reportVectorizationFailure("Single iteration (non) loop",
        "loop trip count is one, irrelevant for vectorization",
        "SingleIterationLoop", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  // Dispatch on the scalar-epilogue policy: an allowed epilogue takes the
  // straightforward path; the remaining cases either force tail folding or
  // forbid runtime checks for size reasons.
  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
    return computeFeasibleMaxVF(TC, UserVF, false);
  case CM_ScalarEpilogueNotAllowedUsePredicate:
    LLVM_FALLTHROUGH;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
               << "LV: Not allowing scalar epilogue, creating predicated "
               << "vector loop.\n");
    break;
  case CM_ScalarEpilogueNotAllowedLowTripLoop:
    // fallthrough as a special case of OptForSize
  case CM_ScalarEpilogueNotAllowedOptSize:
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
      LLVM_DEBUG(
          dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
    else
      LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
                        << "count.\n");

    // Bail if runtime checks are required, which are not good when optimising
    // for size.
    if (runtimeChecksRequired())
      return FixedScalableVFPair::getNone();

    break;
  }

  // The only loops we can vectorize without a scalar epilogue, are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
  // that not every instruction executes on the last iteration.  This will
  // require a lane mask which varies through the vector loop body.  (TODO)
  if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
    // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fallback to a vectorization with a scalar epilogue.
    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                           "scalar epilogue instead.\n");
      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
      return computeFeasibleMaxVF(TC, UserVF, false);
    }
    return FixedScalableVFPair::getNone();
  }

  // Now try the tail folding

  // Invalidate interleave groups that require an epilogue if we can't mask
  // the interleave-group.
  if (!useMaskedInterleavedAccesses(TTI)) {
    assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
           "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

  FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
  // Avoid tail folding if the trip count is known to be a multiple of any VF
  // we chose.
  // FIXME: The condition below pessimises the case for fixed-width vectors,
  // when scalable VFs are also candidates for vectorization.
  if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
    ElementCount MaxFixedVF = MaxFactors.FixedVF;
    assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
           "MaxFixedVF must be a power of 2");
    unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
                                   : MaxFixedVF.getFixedValue();
    // Check (symbolically via SCEV) whether the exit count is an exact
    // multiple of MaxVF * IC; if so, no tail remains and no folding is needed.
    ScalarEvolution *SE = PSE.getSE();
    const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
    const SCEV *ExitCount = SE->getAddExpr(
        BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
    const SCEV *Rem = SE->getURemExpr(
        SE->applyLoopGuards(ExitCount, TheLoop),
        SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
    if (Rem->isZero()) {
      // Accept MaxFixedVF if we do not have a tail.
      LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
      return MaxFactors;
    }
  }

  // For scalable vectors don't use tail folding for low trip counts or
  // optimizing for code size. We only permit this if the user has explicitly
  // requested it.
  if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
      ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
      MaxFactors.ScalableVF.isVector())
    MaxFactors.ScalableVF = ElementCount::getScalable(0);

  // If we don't know the precise trip count, or if the trip count that we
  // found modulo the vectorization factor is not zero, try to fold the tail
  // by masking.
  // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
  if (Legal->prepareToFoldTailByMasking()) {
    FoldTailByMasking = true;
    return MaxFactors;
  }

  // If there was a tail-folding hint/switch, but we can't fold the tail by
  // masking, fallback to a vectorization with a scalar epilogue.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                         "scalar epilogue instead.\n");
    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
    return MaxFactors;
  }

  // Tail folding was explicitly required but is impossible; give up.
  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
    LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
    return FixedScalableVFPair::getNone();
  }

  if (TC == 0) {
    reportVectorizationFailure(
        "Unable to calculate the loop count due to complex control flow",
        "unable to calculate the loop count due to complex control flow",
        "UnknownLoopCountComplexCFG", ORE, TheLoop);
    return FixedScalableVFPair::getNone();
  }

  reportVectorizationFailure(
      "Cannot optimize for size and vectorize at the same time.",
      "cannot optimize for size and vectorize at the same time. "
      "Enable vectorization of this loop with '#pragma clang loop "
      "vectorize(enable)' when compiling with -Os/-Oz",
      "NoTailLoopWithOptForSize", ORE, TheLoop);
  return FixedScalableVFPair::getNone();
}
5132 
// Starting from MaxSafeVF (whose scalability determines which register class
// is queried), clamp the VF to what fits the target's widest vector register,
// to the constant trip count when that is smaller, and optionally widen it
// again when the target asks to maximize vector bandwidth.
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
    const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
  TypeSize WidestRegister = TTI.getRegisterBitWidth(
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector);

  // Convenience function to return the minimum of two ElementCounts.
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
      ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(1);
  }

  const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
  if (ConstTripCount &&
      ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
      (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
    // If loop trip count (TC) is known at compile time there is no point in
    // choosing VF greater than TC (as done in the loop below). Select maximum
    // power of two which doesn't exceed TC.
    // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
    // when the TC is less than or equal to the known number of lanes.
    auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
                         "exceeding the constant trip count: "
                      << ClampedConstTripCount << "\n");
    return ElementCount::getFixed(ClampedConstTripCount);
  }

  ElementCount MaxVF = MaxVectorElementCount;
  if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
                            TTI.shouldMaximizeVectorBandwidth())) {
    // The bandwidth-maximizing bound is derived from the smallest element
    // type instead of the widest, allowing larger VFs for narrow elements.
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
        ComputeScalableMaxVF);
    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorElementCount).
    SmallVector<ElementCount, 8> VFs;
    for (ElementCount VS = MaxVectorElementCount * 2;
         ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      for (auto &pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i];
        break;
      }
    }
    // Never pick a VF below the target's stated minimum for this element type.
    if (ElementCount MinVF =
            TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
      if (ElementCount::isKnownLT(MaxVF, MinVF)) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }

    // Invalidate any widening decisions we might have made, in case the loop
    // requires prediction (decided later), but we have already made some
    // load/store widening decisions.
    invalidateCostModelingDecisions();
  }
  return MaxVF;
}
5228 
5229 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5230   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5231     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5232     auto Min = Attr.getVScaleRangeMin();
5233     auto Max = Attr.getVScaleRangeMax();
5234     if (Max && Min == Max)
5235       return Max;
5236   }
5237 
5238   return TTI.getVScaleForTuning();
5239 }
5240 
// Returns true if vectorization factor A is considered more profitable than
// factor B. Comparison is by total cost when tail folding with a known max
// trip count, and by (estimated) per-lane cost otherwise; cross-multiplication
// avoids FP division in both cases.
bool LoopVectorizationCostModel::isMoreProfitable(
    const VectorizationFactor &A, const VectorizationFactor &B) const {
  InstructionCost CostA = A.Cost;
  InstructionCost CostB = B.Cost;

  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);

  if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
      MaxTripCount) {
    // If we are folding the tail and the trip count is a known (possibly small)
    // constant, the trip count will be rounded up to an integer number of
    // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
    // which we compare directly. When not folding the tail, the total cost will
    // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
    // approximated with the per-lane cost below instead of using the tripcount
    // as here.
    auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
    auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
    return RTCostA < RTCostB;
  }

  // Improve estimate for the vector width if it is scalable.
  unsigned EstimatedWidthA = A.Width.getKnownMinValue();
  unsigned EstimatedWidthB = B.Width.getKnownMinValue();
  if (Optional<unsigned> VScale = getVScaleForTuning()) {
    if (A.Width.isScalable())
      EstimatedWidthA *= VScale.getValue();
    if (B.Width.isScalable())
      EstimatedWidthB *= VScale.getValue();
  }

  // Assume vscale may be larger than 1 (or the value being tuned for),
  // so that scalable vectorization is slightly favorable over fixed-width
  // vectorization.
  // Note the non-strict <= here: ties go to the scalable candidate A.
  if (A.Width.isScalable() && !B.Width.isScalable())
    return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);

  // To avoid the need for FP division:
  //      (CostA / A.Width) < (CostB / B.Width)
  // <=>  (CostA * B.Width) < (CostB * A.Width)
  return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
}
5283 
5284 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5285     const ElementCountSet &VFCandidates) {
5286   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5287   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5288   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5289   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5290          "Expected Scalar VF to be a candidate");
5291 
5292   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
5293   VectorizationFactor ChosenFactor = ScalarCost;
5294 
5295   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5296   if (ForceVectorization && VFCandidates.size() > 1) {
5297     // Ignore scalar width, because the user explicitly wants vectorization.
5298     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5299     // evaluation.
5300     ChosenFactor.Cost = InstructionCost::getMax();
5301   }
5302 
5303   SmallVector<InstructionVFPair> InvalidCosts;
5304   for (const auto &i : VFCandidates) {
5305     // The cost for scalar VF=1 is already calculated, so ignore it.
5306     if (i.isScalar())
5307       continue;
5308 
5309     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5310     VectorizationFactor Candidate(i, C.first);
5311 
5312 #ifndef NDEBUG
5313     unsigned AssumedMinimumVscale = 1;
5314     if (Optional<unsigned> VScale = getVScaleForTuning())
5315       AssumedMinimumVscale = VScale.getValue();
5316     unsigned Width =
5317         Candidate.Width.isScalable()
5318             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5319             : Candidate.Width.getFixedValue();
5320     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5321                       << " costs: " << (Candidate.Cost / Width));
5322     if (i.isScalable())
5323       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5324                         << AssumedMinimumVscale << ")");
5325     LLVM_DEBUG(dbgs() << ".\n");
5326 #endif
5327 
5328     if (!C.second && !ForceVectorization) {
5329       LLVM_DEBUG(
5330           dbgs() << "LV: Not considering vector loop of width " << i
5331                  << " because it will not generate any vector instructions.\n");
5332       continue;
5333     }
5334 
5335     // If profitable add it to ProfitableVF list.
5336     if (isMoreProfitable(Candidate, ScalarCost))
5337       ProfitableVFs.push_back(Candidate);
5338 
5339     if (isMoreProfitable(Candidate, ChosenFactor))
5340       ChosenFactor = Candidate;
5341   }
5342 
5343   // Emit a report of VFs with invalid costs in the loop.
5344   if (!InvalidCosts.empty()) {
5345     // Group the remarks per instruction, keeping the instruction order from
5346     // InvalidCosts.
5347     std::map<Instruction *, unsigned> Numbering;
5348     unsigned I = 0;
5349     for (auto &Pair : InvalidCosts)
5350       if (!Numbering.count(Pair.first))
5351         Numbering[Pair.first] = I++;
5352 
5353     // Sort the list, first on instruction(number) then on VF.
5354     llvm::sort(InvalidCosts,
5355                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5356                  if (Numbering[A.first] != Numbering[B.first])
5357                    return Numbering[A.first] < Numbering[B.first];
5358                  ElementCountComparator ECC;
5359                  return ECC(A.second, B.second);
5360                });
5361 
5362     // For a list of ordered instruction-vf pairs:
5363     //   [(load, vf1), (load, vf2), (store, vf1)]
5364     // Group the instructions together to emit separate remarks for:
5365     //   load  (vf1, vf2)
5366     //   store (vf1)
5367     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5368     auto Subset = ArrayRef<InstructionVFPair>();
5369     do {
5370       if (Subset.empty())
5371         Subset = Tail.take_front(1);
5372 
5373       Instruction *I = Subset.front().first;
5374 
      // If the next instruction is different, or if there are no other pairs,
      // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2)]
      // to emit:
      //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5380       if (Subset == Tail || Tail[Subset.size()].first != I) {
5381         std::string OutString;
5382         raw_string_ostream OS(OutString);
5383         assert(!Subset.empty() && "Unexpected empty range");
5384         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5385         for (auto &Pair : Subset)
5386           OS << (Pair.second == Subset.front().second ? "" : ", ")
5387              << Pair.second;
5388         OS << "):";
5389         if (auto *CI = dyn_cast<CallInst>(I))
5390           OS << " call to " << CI->getCalledFunction()->getName();
5391         else
5392           OS << " " << I->getOpcodeName();
5393         OS.flush();
5394         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5395         Tail = Tail.drop_front(Subset.size());
5396         Subset = {};
5397       } else
5398         // Grow the subset by one element
5399         Subset = Tail.take_front(Subset.size() + 1);
5400     } while (!Tail.empty());
5401   }
5402 
5403   if (!EnableCondStoresVectorization && NumPredStores) {
5404     reportVectorizationFailure("There are conditional stores.",
5405         "store that is conditionally executed prevents vectorization",
5406         "ConditionalStore", ORE, TheLoop);
5407     ChosenFactor = ScalarCost;
5408   }
5409 
5410   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5411                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5412              << "LV: Vectorization seems to be not beneficial, "
5413              << "but was forced by a user.\n");
5414   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5415   return ChosenFactor;
5416 }
5417 
5418 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5419     const Loop &L, ElementCount VF) const {
5420   // Cross iteration phis such as reductions need special handling and are
5421   // currently unsupported.
5422   if (any_of(L.getHeader()->phis(),
5423              [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
5424     return false;
5425 
5426   // Phis with uses outside of the loop require special handling and are
5427   // currently unsupported.
5428   for (auto &Entry : Legal->getInductionVars()) {
5429     // Look for uses of the value of the induction at the last iteration.
5430     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5431     for (User *U : PostInc->users())
5432       if (!L.contains(cast<Instruction>(U)))
5433         return false;
5434     // Look for uses of penultimate value of the induction.
5435     for (User *U : Entry.first->users())
5436       if (!L.contains(cast<Instruction>(U)))
5437         return false;
5438   }
5439 
5440   // Induction variables that are widened require special handling that is
5441   // currently not supported.
5442   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5443         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5444                  this->isProfitableToScalarize(Entry.first, VF));
5445       }))
5446     return false;
5447 
5448   // Epilogue vectorization code has not been auditted to ensure it handles
5449   // non-latch exits properly.  It may be fine, but it needs auditted and
5450   // tested.
5451   if (L.getExitingBlock() != L.getLoopLatch())
5452     return false;
5453 
5454   return true;
5455 }
5456 
5457 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5458     const ElementCount VF) const {
5459   // FIXME: We need a much better cost-model to take different parameters such
5460   // as register pressure, code size increase and cost of extra branches into
5461   // account. For now we apply a very crude heuristic and only consider loops
5462   // with vectorization factors larger than a certain value.
5463   // We also consider epilogue vectorization unprofitable for targets that don't
5464   // consider interleaving beneficial (eg. MVE).
5465   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5466     return false;
5467   // FIXME: We should consider changing the threshold for scalable
5468   // vectors to take VScaleForTuning into account.
5469   if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5470     return true;
5471   return false;
5472 }
5473 
// Select the vectorization factor for an epilogue loop that would follow a
// main vector loop of width \p MainLoopVF. Returns
// VectorizationFactor::Disabled() when epilogue vectorization should not (or
// cannot) be performed. Only candidates for which \p LVP already has a VPlan
// are considered.
VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
                  "not a supported candidate.\n";);
    return Result;
  }

  // A forced epilogue VF (presumably a command-line override -- see its
  // declaration) bypasses the cost model, but is only honoured when a VPlan
  // for that VF exists.
  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
    if (LVP.hasPlanWithVF(ForcedEC))
      return {ForcedEC, 0};
    else {
      LLVM_DEBUG(
          dbgs()
              << "LEV: Epilogue vectorization forced factor is not viable.\n";);
      return Result;
    }
  }

  // Never vectorize the epilogue when the function is optimized for size.
  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
      TheLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs()
            << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
    return Result;
  }

  if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                         "this loop\n");
    return Result;
  }

  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
  // the main loop handles 8 lanes per iteration. We could still benefit from
  // vectorizing the epilogue loop with VF=4.
  ElementCount EstimatedRuntimeVF = MainLoopVF;
  if (MainLoopVF.isScalable()) {
    EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
    if (Optional<unsigned> VScale = getVScaleForTuning())
      EstimatedRuntimeVF *= VScale.getValue();
  }

  // Pick the most profitable previously-collected VF that is known to be
  // smaller than the main loop VF (for a scalable main VF, a fixed-width
  // candidate is compared against the estimated runtime VF instead) and for
  // which a VPlan exists.
  for (auto &NextVF : ProfitableVFs)
    if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
          ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
         ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
        (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
        LVP.hasPlanWithVF(NextVF.Width))
      Result = NextVF;

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width << "\n";);
  return Result;
}
5549 
5550 std::pair<unsigned, unsigned>
5551 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5552   unsigned MinWidth = -1U;
5553   unsigned MaxWidth = 8;
5554   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5555   // For in-loop reductions, no element types are added to ElementTypesInLoop
5556   // if there are no loads/stores in the loop. In this case, check through the
5557   // reduction variables to determine the maximum width.
5558   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5559     // Reset MaxWidth so that we can find the smallest type used by recurrences
5560     // in the loop.
5561     MaxWidth = -1U;
5562     for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
5563       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5564       // When finding the min width used by the recurrence we need to account
5565       // for casts on the input operands of the recurrence.
5566       MaxWidth = std::min<unsigned>(
5567           MaxWidth, std::min<unsigned>(
5568                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5569                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5570     }
5571   } else {
5572     for (Type *T : ElementTypesInLoop) {
5573       MinWidth = std::min<unsigned>(
5574           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5575       MaxWidth = std::max<unsigned>(
5576           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5577     }
5578   }
5579   return {MinWidth, MaxWidth};
5580 }
5581 
5582 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5583   ElementTypesInLoop.clear();
5584   // For each block.
5585   for (BasicBlock *BB : TheLoop->blocks()) {
5586     // For each instruction in the loop.
5587     for (Instruction &I : BB->instructionsWithoutDebug()) {
5588       Type *T = I.getType();
5589 
5590       // Skip ignored values.
5591       if (ValuesToIgnore.count(&I))
5592         continue;
5593 
5594       // Only examine Loads, Stores and PHINodes.
5595       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5596         continue;
5597 
5598       // Examine PHI nodes that are reduction variables. Update the type to
5599       // account for the recurrence type.
5600       if (auto *PN = dyn_cast<PHINode>(&I)) {
5601         if (!Legal->isReductionVariable(PN))
5602           continue;
5603         const RecurrenceDescriptor &RdxDesc =
5604             Legal->getReductionVars().find(PN)->second;
5605         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5606             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5607                                       RdxDesc.getRecurrenceType(),
5608                                       TargetTransformInfo::ReductionFlags()))
5609           continue;
5610         T = RdxDesc.getRecurrenceType();
5611       }
5612 
5613       // Examine the stored values.
5614       if (auto *ST = dyn_cast<StoreInst>(&I))
5615         T = ST->getValueOperand()->getType();
5616 
5617       assert(T->isSized() &&
5618              "Expected the load/store/recurrence type to be sized");
5619 
5620       ElementTypesInLoop.insert(T);
5621     }
5622   }
5623 }
5624 
// Select the interleave (unroll) count to use for the loop at the given
// vectorization factor \p VF. \p LoopCost is the previously computed cost of
// one loop-body iteration at VF; a value of 0 means "not computed yet" and
// triggers a cost-model query here. Returns 1 when interleaving is not
// deemed beneficial.
unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
                                                           unsigned LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  // NOTE(review): presumably interleaving would require a (larger) scalar
  // remainder, which is not allowed here -- confirm.
  if (!isScalarEpilogueAllowed())
    return 1;

  // We used the distance for the interleave count.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
    return 1;

  auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
  const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions(HasReductions && VF = 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross iteration dependences for reductions.
  if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
      !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
    return 1;

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    InstructionCost C = expectedCost(VF).first;
    assert(C.isValid() && "Expected to have chosen a VF with valid cost");
    LoopCost = *C.getValue();

    // Loop body is free and there is no need for interleaving.
    if (LoopCost == 0)
      return 1;
  }

  RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto& pair : R.MaxLocalUsers) {
    pair.second = std::max(pair.second, 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  // Take the minimum interleave count over all register classes, so that no
  // class is over-subscribed.
  for (auto& pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first) << " register class\n");
    // Command-line overrides for the register count, per scalar/vector mode.
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
      LoopInvariantRegs = R.LoopInvariantRegs[pair.first];

    unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC =
          PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                        std::max(1U, (MaxLocalUsers - 1)));
    }

    IC = std::min(IC, TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount =
      TTI.getMaxInterleaveFactor(VF.getKnownMinValue());

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // If trip count is known or estimated compile time constant, limit the
  // interleave count to be less than the trip count divided by VF, provided it
  // is at least 1.
  //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second vector
  // iterations is enabled. However, for larger loops, there is likely to be a
  // similar benefit as for fixed-width vectors. For now, we choose to leave
  // the InterleaveCount as if vscale is '1', although if some information about
  // the vector is known (e.g. min vector size), we can make a better decision.
  if (BestKnownTC) {
    MaxInterleaveCount =
        std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
    // Make sure MaxInterleaveCount is greater than 0.
    MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(1u, IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // For any scalar loop that either requires runtime checks or predication we
  // are better off leaving this to the unroller. Note that if we've already
  // vectorized the loop we will have done the runtime check and so interleaving
  // won't require further checks.
  bool ScalarInterleavingRequiresPredication =
      (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
         return Legal->blockNeedsPredication(BB);
       }));
  bool ScalarInterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleaveReductions =
      TTI.enableAggressiveInterleaving(HasReductions);
  if (!ScalarInterleavingRequiresRuntimePointerCheck &&
      !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC =
        std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
        HasReductions &&
        any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
          const RecurrenceDescriptor &RdxDesc = Reduction.second;
          return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
              RdxDesc.getRecurrenceKind());
        });
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
      return 1;
    }

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
            const RecurrenceDescriptor &RdxDesc = Reduction.second;
            return RdxDesc.isOrdered();
          });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      // Cap all candidate counts by the nested-reduction limit.
      unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(StoresIC, LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
        AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);
    } else {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
      return SmallIC;
    }
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
5872 
5873 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5874 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5875   // This function calculates the register usage by measuring the highest number
5876   // of values that are alive at a single location. Obviously, this is a very
5877   // rough estimation. We scan the loop in a topological order in order and
5878   // assign a number to each instruction. We use RPO to ensure that defs are
5879   // met before their users. We assume that each instruction that has in-loop
5880   // users starts an interval. We record every time that an in-loop value is
5881   // used, so we have a list of the first and last occurrences of each
5882   // instruction. Next, we transpose this data structure into a multi map that
5883   // holds the list of intervals that *end* at a specific location. This multi
5884   // map allows us to perform a linear search. We scan the instructions linearly
5885   // and record each time that a new interval starts, by placing it in a set.
5886   // If we find this value in the multi-map then we remove it from the set.
5887   // The max register usage is the maximum size of the set.
5888   // We also search for instructions that are defined outside the loop, but are
5889   // used inside the loop. We need this number separately from the max-interval
5890   // usage number because when we unroll, loop-invariant values do not take
5891   // more register.
5892   LoopBlocksDFS DFS(TheLoop);
5893   DFS.perform(LI);
5894 
5895   RegisterUsage RU;
5896 
5897   // Each 'key' in the map opens a new interval. The values
5898   // of the map are the index of the 'last seen' usage of the
5899   // instruction that is the key.
5900   using IntervalMap = DenseMap<Instruction *, unsigned>;
5901 
5902   // Maps instruction to its index.
5903   SmallVector<Instruction *, 64> IdxToInstr;
5904   // Marks the end of each interval.
5905   IntervalMap EndPoint;
5906   // Saves the list of instruction indices that are used in the loop.
5907   SmallPtrSet<Instruction *, 8> Ends;
5908   // Saves the list of values that are used in the loop but are
5909   // defined outside the loop, such as arguments and constants.
5910   SmallPtrSet<Value *, 8> LoopInvariants;
5911 
5912   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5913     for (Instruction &I : BB->instructionsWithoutDebug()) {
5914       IdxToInstr.push_back(&I);
5915 
5916       // Save the end location of each USE.
5917       for (Value *U : I.operands()) {
5918         auto *Instr = dyn_cast<Instruction>(U);
5919 
5920         // Ignore non-instruction values such as arguments, constants, etc.
5921         if (!Instr)
5922           continue;
5923 
5924         // If this instruction is outside the loop then record it and continue.
5925         if (!TheLoop->contains(Instr)) {
5926           LoopInvariants.insert(Instr);
5927           continue;
5928         }
5929 
5930         // Overwrite previous end points.
5931         EndPoint[Instr] = IdxToInstr.size();
5932         Ends.insert(Instr);
5933       }
5934     }
5935   }
5936 
5937   // Saves the list of intervals that end with the index in 'key'.
5938   using InstrList = SmallVector<Instruction *, 2>;
5939   DenseMap<unsigned, InstrList> TransposeEnds;
5940 
5941   // Transpose the EndPoints to a list of values that end at each index.
5942   for (auto &Interval : EndPoint)
5943     TransposeEnds[Interval.second].push_back(Interval.first);
5944 
5945   SmallPtrSet<Instruction *, 8> OpenIntervals;
5946   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5947   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5948 
5949   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5950 
5951   // A lambda that gets the register usage for the given type and VF.
5952   const auto &TTICapture = TTI;
5953   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5954     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5955       return 0;
5956     InstructionCost::CostType RegUsage =
5957         *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
5958     assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
5959            "Nonsensical values for register usage.");
5960     return RegUsage;
5961   };
5962 
5963   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5964     Instruction *I = IdxToInstr[i];
5965 
5966     // Remove all of the instructions that end at this location.
5967     InstrList &List = TransposeEnds[i];
5968     for (Instruction *ToRemove : List)
5969       OpenIntervals.erase(ToRemove);
5970 
5971     // Ignore instructions that are never used within the loop.
5972     if (!Ends.count(I))
5973       continue;
5974 
5975     // Skip ignored values.
5976     if (ValuesToIgnore.count(I))
5977       continue;
5978 
5979     // For each VF find the maximum usage of registers.
5980     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5981       // Count the number of live intervals.
5982       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5983 
5984       if (VFs[j].isScalar()) {
5985         for (auto Inst : OpenIntervals) {
5986           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5987           if (RegUsage.find(ClassID) == RegUsage.end())
5988             RegUsage[ClassID] = 1;
5989           else
5990             RegUsage[ClassID] += 1;
5991         }
5992       } else {
5993         collectUniformsAndScalars(VFs[j]);
5994         for (auto Inst : OpenIntervals) {
5995           // Skip ignored values for VF > 1.
5996           if (VecValuesToIgnore.count(Inst))
5997             continue;
5998           if (isScalarAfterVectorization(Inst, VFs[j])) {
5999             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6000             if (RegUsage.find(ClassID) == RegUsage.end())
6001               RegUsage[ClassID] = 1;
6002             else
6003               RegUsage[ClassID] += 1;
6004           } else {
6005             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
6006             if (RegUsage.find(ClassID) == RegUsage.end())
6007               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6008             else
6009               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6010           }
6011         }
6012       }
6013 
6014       for (auto& pair : RegUsage) {
6015         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
6016           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
6017         else
6018           MaxUsages[j][pair.first] = pair.second;
6019       }
6020     }
6021 
6022     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6023                       << OpenIntervals.size() << '\n');
6024 
6025     // Add the current instruction to the list of open intervals.
6026     OpenIntervals.insert(I);
6027   }
6028 
6029   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6030     SmallMapVector<unsigned, unsigned, 4> Invariant;
6031 
6032     for (auto Inst : LoopInvariants) {
6033       unsigned Usage =
6034           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6035       unsigned ClassID =
6036           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6037       if (Invariant.find(ClassID) == Invariant.end())
6038         Invariant[ClassID] = Usage;
6039       else
6040         Invariant[ClassID] += Usage;
6041     }
6042 
6043     LLVM_DEBUG({
6044       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6045       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6046              << " item\n";
6047       for (const auto &pair : MaxUsages[i]) {
6048         dbgs() << "LV(REG): RegisterClass: "
6049                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6050                << " registers\n";
6051       }
6052       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6053              << " item\n";
6054       for (const auto &pair : Invariant) {
6055         dbgs() << "LV(REG): RegisterClass: "
6056                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6057                << " registers\n";
6058       }
6059     });
6060 
6061     RU.LoopInvariantRegs = Invariant;
6062     RU.MaxLocalUsers = MaxUsages[i];
6063     RUs[i] = RU;
6064   }
6065 
6066   return RUs;
6067 }
6068 
6069 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6070                                                            ElementCount VF) {
6071   // TODO: Cost model for emulated masked load/store is completely
6072   // broken. This hack guides the cost model to use an artificially
6073   // high enough value to practically disable vectorization with such
6074   // operations, except where previously deployed legality hack allowed
6075   // using very low cost values. This is to avoid regressions coming simply
6076   // from moving "masked load/store" check from legality to cost model.
6077   // Masked Load/Gather emulation was previously never allowed.
6078   // Limited number of Masked Store/Scatter emulation was allowed.
6079   assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
6080   return isa<LoadInst>(I) ||
6081          (isa<StoreInst>(I) &&
6082           NumPredStores > NumberOfStoresToPredicate);
6083 }
6084 
6085 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6086   // If we aren't vectorizing the loop, or if we've already collected the
6087   // instructions to scalarize, there's nothing to do. Collection may already
6088   // have occurred if we have a user-selected VF and are now computing the
6089   // expected cost for interleaving.
6090   if (VF.isScalar() || VF.isZero() ||
6091       InstsToScalarize.find(VF) != InstsToScalarize.end())
6092     return;
6093 
6094   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
6095   // not profitable to scalarize any instructions, the presence of VF in the
6096   // map will indicate that we've analyzed it already.
6097   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6098 
6099   // Find all the instructions that are scalar with predication in the loop and
6100   // determine if it would be better to not if-convert the blocks they are in.
6101   // If so, we also record the instructions to scalarize.
6102   for (BasicBlock *BB : TheLoop->blocks()) {
6103     if (!blockNeedsPredicationForAnyReason(BB))
6104       continue;
6105     for (Instruction &I : *BB)
6106       if (isScalarWithPredication(&I, VF)) {
6107         ScalarCostsTy ScalarCosts;
6108         // Do not apply discount if scalable, because that would lead to
6109         // invalid scalarization costs.
6110         // Do not apply discount logic if hacked cost is needed
6111         // for emulated masked memrefs.
6112         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6113             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6114           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6115         // Remember that BB will remain after vectorization.
6116         PredicatedBBsAfterVectorization.insert(BB);
6117       }
6118   }
6119 }
6120 
// Computes the discount gained from scalarizing the single-use instruction
// chain feeding the predicated instruction PredInst, instead of vectorizing
// it. A non-negative return value means the scalar form is at least as cheap
// as the vector form. The scalar cost of each visited instruction is recorded
// in ScalarCosts as a side effect.
int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I, VF))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    InstructionCost ScalarCost =
        VF.getFixedValue() *
        getInstructionCost(I, ElementCount::getFixed(1)).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
      // Insert=true, Extract=false: the scalar results must be inserted back
      // into a vector value.
      ScalarCost += TTI.getScalarizationOverhead(
          cast<VectorType>(ToVectorTy(I->getType(), VF)),
          APInt::getAllOnes(VF.getFixedValue()), true, false);
      // One phi per lane merges the predicated scalar value back in.
      ScalarCost +=
          VF.getFixedValue() *
          TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF)) {
          // Insert=false, Extract=true: the operand must be extracted from a
          // vector value for each lane.
          ScalarCost += TTI.getScalarizationOverhead(
              cast<VectorType>(ToVectorTy(J->getType(), VF)),
              APInt::getAllOnes(VF.getFixedValue()), false, true);
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  // NOTE(review): this dereference assumes the accumulated Discount is a
  // valid cost (getValue() yields None for invalid costs) — confirm callers
  // only reach this point with valid per-instruction costs.
  return *Discount.getValue();
}
6232 
6233 LoopVectorizationCostModel::VectorizationCostTy
6234 LoopVectorizationCostModel::expectedCost(
6235     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6236   VectorizationCostTy Cost;
6237 
6238   // For each block.
6239   for (BasicBlock *BB : TheLoop->blocks()) {
6240     VectorizationCostTy BlockCost;
6241 
6242     // For each instruction in the old loop.
6243     for (Instruction &I : BB->instructionsWithoutDebug()) {
6244       // Skip ignored values.
6245       if (ValuesToIgnore.count(&I) ||
6246           (VF.isVector() && VecValuesToIgnore.count(&I)))
6247         continue;
6248 
6249       VectorizationCostTy C = getInstructionCost(&I, VF);
6250 
6251       // Check if we should override the cost.
6252       if (C.first.isValid() &&
6253           ForceTargetInstructionCost.getNumOccurrences() > 0)
6254         C.first = InstructionCost(ForceTargetInstructionCost);
6255 
6256       // Keep a list of instructions with invalid costs.
6257       if (Invalid && !C.first.isValid())
6258         Invalid->emplace_back(&I, VF);
6259 
6260       BlockCost.first += C.first;
6261       BlockCost.second |= C.second;
6262       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6263                         << " for VF " << VF << " For instruction: " << I
6264                         << '\n');
6265     }
6266 
6267     // If we are vectorizing a predicated block, it will have been
6268     // if-converted. This means that the block's instructions (aside from
6269     // stores and instructions that may divide by zero) will now be
6270     // unconditionally executed. For the scalar case, we may not always execute
6271     // the predicated block, if it is an if-else block. Thus, scale the block's
6272     // cost by the probability of executing it. blockNeedsPredication from
6273     // Legal is used so as to not include all blocks in tail folded loops.
6274     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6275       BlockCost.first /= getReciprocalPredBlockProb();
6276 
6277     Cost.first += BlockCost.first;
6278     Cost.second |= BlockCost.second;
6279   }
6280 
6281   return Cost;
6282 }
6283 
6284 /// Gets Address Access SCEV after verifying that the access pattern
6285 /// is loop invariant except the induction variable dependence.
6286 ///
6287 /// This SCEV can be sent to the Target in order to estimate the address
6288 /// calculation cost.
6289 static const SCEV *getAddressAccessSCEV(
6290               Value *Ptr,
6291               LoopVectorizationLegality *Legal,
6292               PredicatedScalarEvolution &PSE,
6293               const Loop *TheLoop) {
6294 
6295   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6296   if (!Gep)
6297     return nullptr;
6298 
6299   // We are looking for a gep with all loop invariant indices except for one
6300   // which should be an induction variable.
6301   auto SE = PSE.getSE();
6302   unsigned NumOperands = Gep->getNumOperands();
6303   for (unsigned i = 1; i < NumOperands; ++i) {
6304     Value *Opd = Gep->getOperand(i);
6305     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6306         !Legal->isInductionVariable(Opd))
6307       return nullptr;
6308   }
6309 
6310   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
6311   return PSE.getSCEV(Ptr);
6312 }
6313 
6314 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6315   return Legal->hasStride(I->getOperand(0)) ||
6316          Legal->hasStride(I->getOperand(1));
6317 }
6318 
6319 InstructionCost
6320 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6321                                                         ElementCount VF) {
6322   assert(VF.isVector() &&
6323          "Scalarization cost of instruction implies vectorization.");
6324   if (VF.isScalable())
6325     return InstructionCost::getInvalid();
6326 
6327   Type *ValTy = getLoadStoreType(I);
6328   auto SE = PSE.getSE();
6329 
6330   unsigned AS = getLoadStoreAddressSpace(I);
6331   Value *Ptr = getLoadStorePointerOperand(I);
6332   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6333   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6334   //       that it is being called from this specific place.
6335 
6336   // Figure out whether the access is strided and get the stride value
6337   // if it's known in compile time
6338   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6339 
6340   // Get the cost of the scalar memory instruction and address computation.
6341   InstructionCost Cost =
6342       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6343 
6344   // Don't pass *I here, since it is scalar but will actually be part of a
6345   // vectorized loop where the user of it is a vectorized instruction.
6346   const Align Alignment = getLoadStoreAlignment(I);
6347   Cost += VF.getKnownMinValue() *
6348           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6349                               AS, TTI::TCK_RecipThroughput);
6350 
6351   // Get the overhead of the extractelement and insertelement instructions
6352   // we might create due to scalarization.
6353   Cost += getScalarizationOverhead(I, VF);
6354 
6355   // If we have a predicated load/store, it will need extra i1 extracts and
6356   // conditional branches, but may not be executed for each vector lane. Scale
6357   // the cost by the probability of executing the predicated block.
6358   if (isPredicatedInst(I, VF)) {
6359     Cost /= getReciprocalPredBlockProb();
6360 
6361     // Add the cost of an i1 extract and a branch
6362     auto *Vec_i1Ty =
6363         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6364     Cost += TTI.getScalarizationOverhead(
6365         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6366         /*Insert=*/false, /*Extract=*/true);
6367     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6368 
6369     if (useEmulatedMaskMemRefHack(I, VF))
6370       // Artificially setting to a high enough value to practically disable
6371       // vectorization with such operations.
6372       Cost = 3000000;
6373   }
6374 
6375   return Cost;
6376 }
6377 
6378 InstructionCost
6379 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6380                                                     ElementCount VF) {
6381   Type *ValTy = getLoadStoreType(I);
6382   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6383   Value *Ptr = getLoadStorePointerOperand(I);
6384   unsigned AS = getLoadStoreAddressSpace(I);
6385   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6386   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6387 
6388   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6389          "Stride should be 1 or -1 for consecutive memory access");
6390   const Align Alignment = getLoadStoreAlignment(I);
6391   InstructionCost Cost = 0;
6392   if (Legal->isMaskRequired(I))
6393     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6394                                       CostKind);
6395   else
6396     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6397                                 CostKind, I);
6398 
6399   bool Reverse = ConsecutiveStride < 0;
6400   if (Reverse)
6401     Cost +=
6402         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6403   return Cost;
6404 }
6405 
6406 InstructionCost
6407 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6408                                                 ElementCount VF) {
6409   assert(Legal->isUniformMemOp(*I));
6410 
6411   Type *ValTy = getLoadStoreType(I);
6412   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6413   const Align Alignment = getLoadStoreAlignment(I);
6414   unsigned AS = getLoadStoreAddressSpace(I);
6415   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6416   if (isa<LoadInst>(I)) {
6417     return TTI.getAddressComputationCost(ValTy) +
6418            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6419                                CostKind) +
6420            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6421   }
6422   StoreInst *SI = cast<StoreInst>(I);
6423 
6424   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6425   return TTI.getAddressComputationCost(ValTy) +
6426          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6427                              CostKind) +
6428          (isLoopInvariantStoreValue
6429               ? 0
6430               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6431                                        VF.getKnownMinValue() - 1));
6432 }
6433 
6434 InstructionCost
6435 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6436                                                  ElementCount VF) {
6437   Type *ValTy = getLoadStoreType(I);
6438   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6439   const Align Alignment = getLoadStoreAlignment(I);
6440   const Value *Ptr = getLoadStorePointerOperand(I);
6441 
6442   return TTI.getAddressComputationCost(VectorTy) +
6443          TTI.getGatherScatterOpCost(
6444              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6445              TargetTransformInfo::TCK_RecipThroughput, I);
6446 }
6447 
6448 InstructionCost
6449 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6450                                                    ElementCount VF) {
6451   // TODO: Once we have support for interleaving with scalable vectors
6452   // we can calculate the cost properly here.
6453   if (VF.isScalable())
6454     return InstructionCost::getInvalid();
6455 
6456   Type *ValTy = getLoadStoreType(I);
6457   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6458   unsigned AS = getLoadStoreAddressSpace(I);
6459 
6460   auto Group = getInterleavedAccessGroup(I);
6461   assert(Group && "Fail to get an interleaved access group.");
6462 
6463   unsigned InterleaveFactor = Group->getFactor();
6464   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6465 
6466   // Holds the indices of existing members in the interleaved group.
6467   SmallVector<unsigned, 4> Indices;
6468   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6469     if (Group->getMember(IF))
6470       Indices.push_back(IF);
6471 
6472   // Calculate the cost of the whole interleaved group.
6473   bool UseMaskForGaps =
6474       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6475       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6476   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6477       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6478       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6479 
6480   if (Group->isReverse()) {
6481     // TODO: Add support for reversed masked interleaved access.
6482     assert(!Legal->isMaskRequired(I) &&
6483            "Reverse masked interleaved access not supported.");
6484     Cost +=
6485         Group->getNumMembers() *
6486         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6487   }
6488   return Cost;
6489 }
6490 
Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  using namespace llvm::PatternMatch;
  // Early exit for no inloop reductions
  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return None;
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower then we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not we return an invalid cost specifying the original cost method
  // should be used.
  Instruction *RetI = I;
  // Step past a single-user extend feeding the chain.
  if (match(RetI, m_ZExtOrSExt(m_Value()))) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }
  // Step past a single-user mul whose user is the reduction add.
  if (match(RetI, m_Mul(m_Value(), m_Value())) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  if (!InLoopReductionImmediateChains.count(RetI))
    return None;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *LastChain = InLoopReductionImmediateChains[RetI];
  Instruction *ReductionPhi = LastChain;
  // Walk the immediate-chain map back up to the reduction phi.
  while (!isa<PHINode>(ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];

  const RecurrenceDescriptor &RdxDesc =
      Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;

  InstructionCost BaseCost = TTI.getArithmeticReductionCost(
      RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.
  if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
    BaseCost +=
        TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of the
  // patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(1) == LastChain
                           ? dyn_cast<Instruction>(RetI->getOperand(0))
                           : dyn_cast<Instruction>(RetI->getOperand(1));

  // Rebuild the vector type from I's first operand's element type, keeping
  // the element count of VectorTy.
  VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp &&
      match(RedOp,
            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
      match(Op0, m_ZExtOrSExt(m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
      !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce(ext(mul(ext(A), ext(B)))
    // Note that the extend opcodes need to all match, or if A==B they will have
    // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
    // which is equally fine.
    bool IsUnsigned = isa<ZExtInst>(Op0);
    auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
    auto *MulType = VectorType::get(Op0->getType(), VectorTy);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
                             TTI::CastContextHint::None, CostKind, Op0);
    InstructionCost MulCost =
        TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
    InstructionCost Ext2Cost =
        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
                             TTI::CastContextHint::None, CostKind, RedOp);

    InstructionCost RedCost = TTI.getExtendedAddReductionCost(
        /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
        CostKind);

    // Prefer the fused reduction only when it beats the sum of the parts.
    if (RedCost.isValid() &&
        RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
             !TheLoop->isLoopInvariant(RedOp)) {
    // Matched reduce(ext(A))
    bool IsUnsigned = isa<ZExtInst>(RedOp);
    auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
    InstructionCost RedCost = TTI.getExtendedAddReductionCost(
        /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
        CostKind);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
                             TTI::CastContextHint::None, CostKind, RedOp);
    if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp &&
             match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
    if (match(Op0, m_ZExtOrSExt(m_Value())) &&
        Op0->getOpcode() == Op1->getOpcode() &&
        !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
      bool IsUnsigned = isa<ZExtInst>(Op0);
      Type *Op0Ty = Op0->getOperand(0)->getType();
      Type *Op1Ty = Op1->getOperand(0)->getType();
      Type *LargestOpTy =
          Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
                                                                    : Op0Ty;
      auto *ExtType = VectorType::get(LargestOpTy, VectorTy);

      // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
      // different sizes. We take the largest type as the ext to reduce, and add
      // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
      InstructionCost ExtCost0 = TTI.getCastInstrCost(
          Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
          TTI::CastContextHint::None, CostKind, Op0);
      InstructionCost ExtCost1 = TTI.getCastInstrCost(
          Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
          TTI::CastContextHint::None, CostKind, Op1);
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

      InstructionCost RedCost = TTI.getExtendedAddReductionCost(
          /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
          CostKind);
      // Extra extend needed when the two ext operands have different widths.
      InstructionCost ExtraExtCost = 0;
      if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
        Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
        ExtraExtCost = TTI.getCastInstrCost(
            ExtraExtOp->getOpcode(), ExtType,
            VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
            TTI::CastContextHint::None, CostKind, ExtraExtOp);
      }

      if (RedCost.isValid() &&
          (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
        return I == RetI ? RedCost : 0;
    } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
      // Matched reduce(mul())
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

      InstructionCost RedCost = TTI.getExtendedAddReductionCost(
          /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
          CostKind);

      if (RedCost.isValid() && RedCost < MulCost + BaseCost)
        return I == RetI ? RedCost : 0;
    }
  }

  // No pattern beat the base cost: the reduction root instruction reports the
  // base reduction cost; other instructions fall back to normal costing.
  return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
}
6666 
6667 InstructionCost
6668 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6669                                                      ElementCount VF) {
6670   // Calculate scalar cost only. Vectorization cost should be ready at this
6671   // moment.
6672   if (VF.isScalar()) {
6673     Type *ValTy = getLoadStoreType(I);
6674     const Align Alignment = getLoadStoreAlignment(I);
6675     unsigned AS = getLoadStoreAddressSpace(I);
6676 
6677     return TTI.getAddressComputationCost(ValTy) +
6678            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6679                                TTI::TCK_RecipThroughput, I);
6680   }
6681   return getWideningCost(I, VF);
6682 }
6683 
6684 LoopVectorizationCostModel::VectorizationCostTy
6685 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6686                                                ElementCount VF) {
6687   // If we know that this instruction will remain uniform, check the cost of
6688   // the scalar version.
6689   if (isUniformAfterVectorization(I, VF))
6690     VF = ElementCount::getFixed(1);
6691 
6692   if (VF.isVector() && isProfitableToScalarize(I, VF))
6693     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6694 
6695   // Forced scalars do not have any scalarization overhead.
6696   auto ForcedScalar = ForcedScalars.find(VF);
6697   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6698     auto InstSet = ForcedScalar->second;
6699     if (InstSet.count(I))
6700       return VectorizationCostTy(
6701           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6702            VF.getKnownMinValue()),
6703           false);
6704   }
6705 
6706   Type *VectorTy;
6707   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6708 
6709   bool TypeNotScalarized = false;
6710   if (VF.isVector() && VectorTy->isVectorTy()) {
6711     unsigned NumParts = TTI.getNumberOfParts(VectorTy);
6712     if (NumParts)
6713       TypeNotScalarized = NumParts < VF.getKnownMinValue();
6714     else
6715       C = InstructionCost::getInvalid();
6716   }
6717   return VectorizationCostTy(C, TypeNotScalarized);
6718 }
6719 
6720 InstructionCost
6721 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6722                                                      ElementCount VF) const {
6723 
6724   // There is no mechanism yet to create a scalable scalarization loop,
6725   // so this is currently Invalid.
6726   if (VF.isScalable())
6727     return InstructionCost::getInvalid();
6728 
6729   if (VF.isScalar())
6730     return 0;
6731 
6732   InstructionCost Cost = 0;
6733   Type *RetTy = ToVectorTy(I->getType(), VF);
6734   if (!RetTy->isVoidTy() &&
6735       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6736     Cost += TTI.getScalarizationOverhead(
6737         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
6738         false);
6739 
6740   // Some targets keep addresses scalar.
6741   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6742     return Cost;
6743 
6744   // Some targets support efficient element stores.
6745   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6746     return Cost;
6747 
6748   // Collect operands to consider.
6749   CallInst *CI = dyn_cast<CallInst>(I);
6750   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6751 
6752   // Skip operands that do not require extraction/scalarization and do not incur
6753   // any overhead.
6754   SmallVector<Type *> Tys;
6755   for (auto *V : filterExtractingOperands(Ops, VF))
6756     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6757   return Cost + TTI.getOperandsScalarizationOverhead(
6758                     filterExtractingOperands(Ops, VF), Tys);
6759 }
6760 
// For a non-scalar VF, compute and record a widening decision (widen,
// widen-reversed, interleave, gather/scatter, or scalarize) together with its
// cost for every load and store in the loop. Also counts predicated stores
// (NumPredStores) and, when the target prefers scalar addressing, forces
// address computations to stay scalar.
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      // Only loads and stores carry a pointer operand; skip everything else.
      Value *Ptr =  getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
        NumPredStores++;

      if (Legal->isUniformMemOp(I)) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        InstructionCost Cost;
        if (isa<StoreInst>(&I) && VF.isScalable() &&
            isLegalGatherOrScatter(&I, VF)) {
          // Scalable uniform stores cannot be scalarized; fall back to a
          // scatter when the target supports it.
          Cost = getGatherScatterCost(&I, VF);
          setWideningDecision(&I, VF, CM_GatherScatter, Cost);
        } else {
          assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
                 "Cannot yet scalarize uniform stores");
          Cost = getUniformMemOpCost(&I, VF);
          setWideningDecision(&I, VF, CM_Scalarize, Cost);
        }
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
            getLoadStoreType(&I), getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      // Gather/scatter and scalarization costs are per member, so scale by
      // the group size when the access is part of an interleave group.
      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(&I, VF)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
        dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  // Transitive closure over same-block, non-PHI operands of the pointer
  // definitions collected above.
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}
6920 
// Compute the cost of a single instruction at the given VF, dispatching on
// the opcode. Also reports, via the VectorTy out-parameter, the (possibly
// truncated to minimal bitwidth) type used for the cost query.
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
                                               Type *&VectorTy) {
  Type *RetTy = I->getType();
  // Use the narrowed type when the instruction can be shrunk to a smaller
  // bitwidth without changing the result.
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
  auto SE = PSE.getSE();
  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;

  // Returns true if neither I nor any of its users are scheduled for
  // scalarization at this VF, i.e. a single copy of I suffices.
  auto hasSingleCopyAfterVectorization = [this](Instruction *I,
                                                ElementCount VF) -> bool {
    if (VF.isScalar())
      return true;

    auto Scalarized = InstsToScalarize.find(VF);
    assert(Scalarized != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return !Scalarized->second.count(I) &&
           llvm::all_of(I->users(), [&](User *U) {
             auto *UI = cast<Instruction>(U);
             return !Scalarized->second.count(UI);
           });
  };
  // Suppress unused-variable warnings in builds without asserts.
  (void) hasSingleCopyAfterVectorization;

  if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           hasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = ToVectorTy(RetTy, VF);

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Not possible to scalarize scalable vector with predicated instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *Vec_i1Ty =
          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
      return (
          TTI.getScalarizationOverhead(
              Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
          (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
    } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Instruction::Br, CostKind);
    else
      // This branch will be eliminated by if-conversion.
      return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
    if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
      return TTI.getShuffleCost(
          TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
          None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Instruction::Select, ToVectorTy(Phi->getType(), VF),
                 ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
                 CmpInst::BAD_ICMP_PREDICATE, CostKind);

    return TTI.getCFInstrCost(Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // If we have a predicated instruction, it may not be executed for each
    // vector lane. Get the scalarization cost and scale this amount by the
    // probability of executing the predicated block. If the instruction is not
    // predicated, we fall through to the next case.
    if (VF.isVector() && isScalarWithPredication(I, VF)) {
      InstructionCost Cost = 0;

      // These instructions have a non-void type, so account for the phi nodes
      // that we will create. This cost is likely to be zero. The phi node
      // cost, if any, should be scaled by the block probability because it
      // models a copy at the end of each predicated block.
      Cost += VF.getKnownMinValue() *
              TTI.getCFInstrCost(Instruction::PHI, CostKind);

      // The cost of the non-predicated instruction.
      Cost += VF.getKnownMinValue() *
              TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);

      // The cost of insertelement and extractelement instructions needed for
      // scalarization.
      Cost += getScalarizationOverhead(I, VF);

      // Scale the cost by the probability of executing the predicated blocks.
      // This assumes the predicated block for each vector lane is equally
      // likely.
      return Cost / getReciprocalPredBlockProb();
    }
    LLVM_FALLTHROUGH;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Since we will replace the stride by 1 the multiplication should go away.
    if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
      return 0;

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(1);
    TargetTransformInfo::OperandValueProperties Op2VP;
    TargetTransformInfo::OperandValueKind Op2VK =
        TTI.getOperandInfo(Op2, Op2VP);
    if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
      Op2VK = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    return TTI.getArithmeticInstrCost(
        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
        Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
  }
  case Instruction::FNeg: {
    return TTI.getArithmeticInstrCost(
        I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
        TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
        TargetTransformInfo::OP_None, I->getOperand(0), I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(I);
    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));

    const Value *Op0, *Op1;
    using namespace llvm::PatternMatch;
    if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
                        match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      TTI::OperandValueProperties Op1VP = TTI::OP_None;
      TTI::OperandValueProperties Op2VP = TTI::OP_None;
      TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
      TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
              Op1->getType()->getScalarSizeInBits() == 1);

      SmallVector<const Value *, 2> Operands{Op0, Op1};
      return TTI.getArithmeticInstrCost(
          match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
          CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
    }

    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(CondTy, VF);

    CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
    if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
      Pred = Cmp->getPredicate();
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
                                  CostKind, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(0)->getType();
    // Cost the compare at the (narrower) minimal bitwidth of its operand
    // when that operand will be truncated.
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
    if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
      ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
    VectorTy = ToVectorTy(ValTy, VF);
    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
                                  cast<CmpInst>(I)->getPredicate(), CostKind,
                                  I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    // A CM_Scalarize decision means the access is costed per scalar element.
    ElementCount Width = VF;
    if (Width.isVector()) {
      InstWidening Decision = getWideningDecision(I, Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (Decision == CM_Scalarize)
        Width = ElementCount::getFixed(1);
    }
    VectorTy = ToVectorTy(getLoadStoreType(I), Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::BitCast:
    // Pointer bitcasts are free; they disappear during code generation.
    if (I->getType()->isPointerTy())
      return 0;
    LLVM_FALLTHROUGH;
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc: {
    // Computes the CastContextHint from a Load/Store instruction.
    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             "Expected a load or a store!");

      if (VF.isScalar() || !TheLoop->contains(I))
        return TTI::CastContextHint::Normal;

      switch (getWideningDecision(I, VF)) {
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
        return TTI::CastContextHint::Interleave;
      case LoopVectorizationCostModel::CM_Scalarize:
      case LoopVectorizationCostModel::CM_Widen:
        return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
                                        : TTI::CastContextHint::Normal;
      case LoopVectorizationCostModel::CM_Widen_Reverse:
        return TTI::CastContextHint::Reversed;
      case LoopVectorizationCostModel::CM_Unknown:
        llvm_unreachable("Instr did not go through cost modelling?");
      }

      llvm_unreachable("Unhandled case!");
    };

    unsigned Opcode = I->getOpcode();
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc, the context is the only user, which must be a StoreInst.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(I);
      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
                                  Trunc->getSrcTy(), CCH, CostKind, Trunc);
    }

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
      return *RedCost;

    Type *SrcScalarTy = I->getOperand(0)->getType();
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
    if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
      //
      // Calculate the modified src and dest types.
      Type *MinVecTy = VectorTy;
      if (Opcode == Instruction::Trunc) {
        SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
        SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
        VectorTy =
            smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
      }
    }

    return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call: {
    if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
      if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
        return *RedCost;
    // Prefer the cheaper of a widened library call and a vector intrinsic,
    // when an intrinsic mapping exists for the call.
    bool NeedToScalarize;
    CallInst *CI = cast<CallInst>(I);
    InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
    if (getVectorIntrinsicIDForCall(CI, TLI)) {
      InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
      return std::min(CallCost, IntrinsicCost);
    }
    return CallCost;
  }
  case Instruction::ExtractValue:
    return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
  case Instruction::Alloca:
    // We cannot easily widen alloca to a scalable alloca, as
    // the result would need to be a vector of pointers.
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    LLVM_FALLTHROUGH;
  default:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
  } // end of switch.
}
7273 
// Pass identification, replacement for typeid.
char LoopVectorize::ID = 0;

static const char lv_name[] = "Loop Vectorization";

// Register the legacy pass and declare the analyses it depends on.
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7294 
namespace llvm {

/// Create a legacy-PM loop-vectorize pass with default settings.
Pass *createLoopVectorizePass() { return new LoopVectorize(); }

/// Create a legacy-PM loop-vectorize pass; the two flags are forwarded to
/// the LoopVectorize constructor to restrict interleaving/vectorization to
/// loops where it was explicitly requested.
Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
                              bool VectorizeOnlyWhenForced) {
  return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
}

} // end namespace llvm
7305 
7306 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7307   // Check if the pointer operand of a load or store instruction is
7308   // consecutive.
7309   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7310     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7311   return false;
7312 }
7313 
7314 void LoopVectorizationCostModel::collectValuesToIgnore() {
7315   // Ignore ephemeral values.
7316   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7317 
7318   // Ignore type-promoting instructions we identified during reduction
7319   // detection.
7320   for (auto &Reduction : Legal->getReductionVars()) {
7321     const RecurrenceDescriptor &RedDes = Reduction.second;
7322     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7323     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7324   }
7325   // Ignore type-casting instructions we identified during induction
7326   // detection.
7327   for (auto &Induction : Legal->getInductionVars()) {
7328     const InductionDescriptor &IndDes = Induction.second;
7329     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7330     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7331   }
7332 }
7333 
7334 void LoopVectorizationCostModel::collectInLoopReductions() {
7335   for (auto &Reduction : Legal->getReductionVars()) {
7336     PHINode *Phi = Reduction.first;
7337     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7338 
7339     // We don't collect reductions that are type promoted (yet).
7340     if (RdxDesc.getRecurrenceType() != Phi->getType())
7341       continue;
7342 
7343     // If the target would prefer this reduction to happen "in-loop", then we
7344     // want to record it as such.
7345     unsigned Opcode = RdxDesc.getOpcode();
7346     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7347         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7348                                    TargetTransformInfo::ReductionFlags()))
7349       continue;
7350 
7351     // Check that we can correctly put the reductions into the loop, by
7352     // finding the chain of operations that leads from the phi to the loop
7353     // exit value.
7354     SmallVector<Instruction *, 4> ReductionOperations =
7355         RdxDesc.getReductionOpChain(Phi, TheLoop);
7356     bool InLoop = !ReductionOperations.empty();
7357     if (InLoop) {
7358       InLoopReductionChains[Phi] = ReductionOperations;
7359       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7360       Instruction *LastChain = Phi;
7361       for (auto *I : ReductionOperations) {
7362         InLoopReductionImmediateChains[I] = LastChain;
7363         LastChain = I;
7364       }
7365     }
7366     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7367                       << " reduction for phi: " << *Phi << "\n");
7368   }
7369 }
7370 
7371 // TODO: we could return a pair of values that specify the max VF and
7372 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7373 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
7374 // doesn't have a cost model that can choose which plan to execute if
7375 // more than one is generated.
7376 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7377                                  LoopVectorizationCostModel &CM) {
7378   unsigned WidestType;
7379   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7380   return WidestVectorRegBits / WidestType;
7381 }
7382 
// Plan vectorization for the VPlan-native path (outer loops only). Returns
// the chosen VectorizationFactor, or Disabled when planning is not possible
// or when running in VPlan-build stress-test mode.
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  assert(!UserVF.isScalable() && "scalable vectors not yet supported");
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      // Derive a VF from the widest fixed-width register and widest type.
      VF = ElementCount::getFixed(determineVPlanVF(
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedSize(),
          CM));
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}
7427 
// Top-level planning entry for innermost loops: compute the feasible VF
// range, build VPlans for all candidate VFs, and pick the most profitable
// factor. Returns None when the loop must not be vectorized or interleaved.
Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  // A user-provided VF short-circuits candidate exploration if it is legal
  // (i.e. does not exceed the corresponding fixed/scalable maximum).
  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
  if (!UserVF.isZero() && UserVFIsLegal) {
    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (CM.selectUserVectorizationFactor(UserVF)) {
      LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
      CM.collectInLoopReductions();
      buildVPlansWithVPRecipes(UserVF, UserVF);
      LLVM_DEBUG(printPlans(dbgs()));
      return {{UserVF, 0}};
    } else
      reportVectorizationInfo("UserVF ignored because of invalid costs.",
                              "InvalidCost", ORE, OrigLoop);
  }

  // Populate the set of Vectorization Factor Candidates: all powers of two
  // up to the fixed and scalable maxima respectively.
  ElementCountSet VFCandidates;
  for (auto VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
    VFCandidates.insert(VF);
  for (auto VF = ElementCount::getScalable(1);
       ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.insert(VF);

  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF.isVector())
      CM.collectInstsToScalarize(VF);
  }

  CM.collectInLoopReductions();
  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
  if (!MaxFactors.hasVector())
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);

  // Check if it is profitable to vectorize with runtime checks.
  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
  if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
    bool PragmaThresholdReached =
        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
    bool ThresholdReached =
        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
    // The pragma threshold applies unconditionally; the default threshold
    // only applies when the user did not allow reordering via hints.
    if ((ThresholdReached && !Hints.allowReordering()) ||
        PragmaThresholdReached) {
      ORE->emit([&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
                   OrigLoop->getHeader())
               << "loop not vectorized: cannot prove it is safe to reorder "
                  "memory operations";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      Hints.emitRemarkWithHints();
      return VectorizationFactor::Disabled();
    }
  }
  return SelectedVF;
}
7521 
7522 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7523   assert(count_if(VPlans,
7524                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7525              1 &&
7526          "Best VF has not a single VPlan.");
7527 
7528   for (const VPlanPtr &Plan : VPlans) {
7529     if (Plan->hasVF(VF))
7530       return *Plan.get();
7531   }
7532   llvm_unreachable("No plan found!");
7533 }
7534 
7535 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7536   SmallVector<Metadata *, 4> MDs;
7537   // Reserve first location for self reference to the LoopID metadata node.
7538   MDs.push_back(nullptr);
7539   bool IsUnrollMetadata = false;
7540   MDNode *LoopID = L->getLoopID();
7541   if (LoopID) {
7542     // First find existing loop unrolling disable metadata.
7543     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7544       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7545       if (MD) {
7546         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7547         IsUnrollMetadata =
7548             S && S->getString().startswith("llvm.loop.unroll.disable");
7549       }
7550       MDs.push_back(LoopID->getOperand(i));
7551     }
7552   }
7553 
7554   if (!IsUnrollMetadata) {
7555     // Add runtime unroll disable metadata.
7556     LLVMContext &Context = L->getHeader()->getContext();
7557     SmallVector<Metadata *, 1> DisableOperands;
7558     DisableOperands.push_back(
7559         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7560     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7561     MDs.push_back(DisableNode);
7562     MDNode *NewLoopID = MDNode::get(Context, MDs);
7563     // Set operand 0 to refer to the loop id itself.
7564     NewLoopID->replaceOperandWith(0, NewLoopID);
7565     L->setLoopID(NewLoopID);
7566   }
7567 }
7568 
// Generate code for \p BestVPlan at the chosen VF/UF: create the vector loop
// skeleton, execute the plan to widen instructions, transfer loop metadata,
// and run the fix-up phase. The heavy lifting is delegated to \p ILV.
void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
                                           VPlan &BestVPlan,
                                           InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
                    << '\n');

  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
  Value *CanonicalIVStartValue;
  std::tie(State.CFG.VectorPreHeader, CanonicalIVStartValue) =
      ILV.createVectorizedLoopSkeleton();
  ILV.collectPoisonGeneratingRecipes(State);

  ILV.printDebugTracesAtStart();

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
                             ILV.getOrCreateVectorTripCount(nullptr),
                             CanonicalIVStartValue, State);
  BestVPlan.execute(&State);

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  MDNode *OrigLoopID = OrigLoop->getLoopID();

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});

  Loop *L = LI->getLoopFor(State.CFG.PrevBB);
  if (VectorizedLoopID.hasValue())
    L->setLoopID(VectorizedLoopID.getValue());
  else {
    // Keep all loop hints from the original loop on the vector loop (we'll
    // replace the vectorizer-specific hints below).
    if (MDNode *LID = OrigLoop->getLoopID())
      L->setLoopID(LID);

    // Mark the vector loop as already vectorized so it isn't revisited.
    LoopVectorizeHints Hints(L, true, *ORE);
    Hints.setAlreadyVectorized();
  }
  // Disable runtime unrolling when vectorizing the epilogue loop.
  // A non-null CanonicalIVStartValue indicates the epilogue-vectorization
  // second pass (see createEpilogueVectorizedLoopSkeleton).
  if (CanonicalIVStartValue)
    AddRuntimeUnrollDisableMetaData(L);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  //    predication, updating analyses.
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}
7631 
7632 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7633 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7634   for (const auto &Plan : VPlans)
7635     if (PrintVPlansInDotFormat)
7636       Plan->printDOT(O);
7637     else
7638       Plan->print(O);
7639 }
7640 #endif
7641 
// Collect instructions of the original loop that become dead after
// vectorization: exit-condition compares (the vector loop gets new
// control-flow), their single-use trunc operands, and induction updates whose
// only live user is the induction phi itself.
void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if it's only used by the
  // terminator
  SmallVector<BasicBlock*> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    // Skip blocks we've already processed (insert returns false on repeat).
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // The operands of the icmp is often a dead trunc, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
          DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if exists, isn't dead: it will be used for masking. Don't kill it.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    // Dead iff every user is either the phi itself or already known dead.
    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);
  }
}
7686 
// With VF == 1 (unrolling) there are no vector lanes, so broadcasting a
// scalar is the identity operation.
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7688 
7689 //===--------------------------------------------------------------------===//
7690 // EpilogueVectorizerMainLoop
7691 //===--------------------------------------------------------------------===//
7692 
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// First pass of epilogue vectorization: emits the main vector loop skeleton
/// plus the guards (epilogue iteration-count check, SCEV and memory runtime
/// checks, main iteration-count check), stashing them in EPI for reuse by the
/// second pass.
std::pair<BasicBlock *, Value *>
EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  createVectorLoopSkeleton("");

  // Generate the code to check the minimum iteration count of the vector
  // epilogue (see below).
  EPI.EpilogueIterationCountCheck =
      emitMinimumIterationCountCheck(LoopScalarPreHeader, true);
  EPI.EpilogueIterationCountCheck->setName("iter.check");

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);

  // Generate the code that checks at runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);

  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for, by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitMinimumIterationCountCheck(LoopScalarPreHeader, false);

  // Generate the induction variable.
  Value *CountRoundDown = getOrCreateVectorTripCount(LoopVectorPreHeader);
  EPI.VectorTripCount = CountRoundDown;

  // Skip induction resume value creation here because they will be created in
  // the second pass. If we created them here, they wouldn't be used anyway,
  // because the vplan in the second pass still contains the inductions from the
  // original loop.

  // No canonical-IV start value in the first pass (second return element).
  return {completeLoopSkeleton(OrigLoopID), nullptr};
}
7735 
7736 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7737   LLVM_DEBUG({
7738     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7739            << "Main Loop VF:" << EPI.MainLoopVF
7740            << ", Main Loop UF:" << EPI.MainLoopUF
7741            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7742            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7743   });
7744 }
7745 
7746 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7747   DEBUG_WITH_TYPE(VerboseDebug, {
7748     dbgs() << "intermediate fn:\n"
7749            << *OrigLoop->getHeader()->getParent() << "\n";
7750   });
7751 }
7752 
// Emit a minimum-iteration-count guard in the current vector preheader and
// split off a fresh "vector.ph". With ForEpilogue set, the check uses the
// epilogue VF/UF and also updates dominators and saves the trip count in EPI
// for the second pass.
BasicBlock *
EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(BasicBlock *Bypass,
                                                           bool ForEpilogue) {
  assert(Bypass && "Expected valid bypass basic block.");
  ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
  unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
  Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // main vector loop.
  // ULE (rather than ULT) when a scalar epilogue is required, so at least one
  // scalar iteration is left over.
  auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
      ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;

  Value *CheckMinIters = Builder.CreateICmp(
      P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
      "min.iters.check");

  if (!ForEpilogue)
    TCCheckBlock->setName("vector.main.loop.iter.check");

  // Create new preheader for vector loop.
  LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
                                   DT, LI, nullptr, "vector.ph");

  if (ForEpilogue) {
    assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                                 DT->getNode(Bypass)->getIDom()) &&
           "TC check is expected to dominate Bypass");

    // Update dominator for Bypass & LoopExit.
    DT->changeImmediateDominator(Bypass, TCCheckBlock);
    if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
      // For loops with multiple exits, there's no edge from the middle block
      // to exit blocks (as the epilogue must run) and thus no need to update
      // the immediate dominator of the exit blocks.
      DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

    LoopBypassBlocks.push_back(TCCheckBlock);

    // Save the trip count so we don't have to regenerate it in the
    // vec.epilog.iter.check. This is safe to do because the trip count
    // generated here dominates the vector epilog iter check.
    EPI.TripCount = Count;
  }

  // Replace the unconditional terminator with the guard branch.
  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));

  return TCCheckBlock;
}
7808 
7809 //===--------------------------------------------------------------------===//
7810 // EpilogueVectorizerEpilogueLoop
7811 //===--------------------------------------------------------------------===//
7812 
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// Second pass of epilogue vectorization: emits the epilogue vector loop
/// skeleton, re-wires the guards saved in EPI by the first pass, and creates
/// the resume values feeding the epilogue and scalar loops.
std::pair<BasicBlock *, Value *>
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  createVectorLoopSkeleton("vec.epilog.");

  // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue skip to the scalar part.
  BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
  VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
  LoopVectorPreHeader =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, "vec.epilog.ph");
  emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
                                          VecEpilogueIterationCountCheck);

  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass.");
  // Failing the main-loop count check now jumps straight to the epilogue
  // preheader instead of the (old) vector preheader.
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopVectorPreHeader);

  DT->changeImmediateDominator(LoopVectorPreHeader,
                               EPI.MainLoopIterationCountCheck);

  // The remaining first-pass guards now bypass directly to the scalar loop.
  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  if (EPI.SCEVSafetyCheck)
    EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
  if (EPI.MemSafetyCheck)
    EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  DT->changeImmediateDominator(
      VecEpilogueIterationCountCheck,
      VecEpilogueIterationCountCheck->getSinglePredecessor());

  DT->changeImmediateDominator(LoopScalarPreHeader,
                               EPI.EpilogueIterationCountCheck);
  if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks  and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock,
                                 EPI.EpilogueIterationCountCheck);

  // Keep track of bypass blocks, as they feed start values to the induction
  // phis in the scalar loop preheader.
  if (EPI.SCEVSafetyCheck)
    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
  if (EPI.MemSafetyCheck)
    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);

  // The vec.epilog.iter.check block may contain Phi nodes from reductions which
  // merge control-flow from the latch block and the middle block. Update the
  // incoming values here and move the Phi into the preheader.
  SmallVector<PHINode *, 4> PhisInBlock;
  for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
    PhisInBlock.push_back(&Phi);

  for (PHINode *Phi : PhisInBlock) {
    Phi->replaceIncomingBlockWith(
        VecEpilogueIterationCountCheck->getSinglePredecessor(),
        VecEpilogueIterationCountCheck);
    Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
    if (EPI.SCEVSafetyCheck)
      Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
    if (EPI.MemSafetyCheck)
      Phi->removeIncomingValue(EPI.MemSafetyCheck);
    Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
  }

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader: either the main loop's vector trip count (main
  // loop ran) or zero (main loop skipped).
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to iteration count
  // check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues({VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
}
7911 
7912 BasicBlock *
7913 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7914     BasicBlock *Bypass, BasicBlock *Insert) {
7915 
7916   assert(EPI.TripCount &&
7917          "Expected trip count to have been safed in the first pass.");
7918   assert(
7919       (!isa<Instruction>(EPI.TripCount) ||
7920        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7921       "saved trip count does not dominate insertion point.");
7922   Value *TC = EPI.TripCount;
7923   IRBuilder<> Builder(Insert->getTerminator());
7924   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7925 
7926   // Generate code to check if the loop's trip count is less than VF * UF of the
7927   // vector epilogue loop.
7928   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7929       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7930 
7931   Value *CheckMinIters =
7932       Builder.CreateICmp(P, Count,
7933                          createStepForVF(Builder, Count->getType(),
7934                                          EPI.EpilogueVF, EPI.EpilogueUF),
7935                          "min.epilog.iters.check");
7936 
7937   ReplaceInstWithInst(
7938       Insert->getTerminator(),
7939       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7940 
7941   LoopBypassBlocks.push_back(Insert);
7942   return Insert;
7943 }
7944 
7945 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7946   LLVM_DEBUG({
7947     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7948            << "Epilogue Loop VF:" << EPI.EpilogueVF
7949            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7950   });
7951 }
7952 
7953 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7954   DEBUG_WITH_TYPE(VerboseDebug, {
7955     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7956   });
7957 }
7958 
7959 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7960     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7961   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7962   bool PredicateAtRangeStart = Predicate(Range.Start);
7963 
7964   for (ElementCount TmpVF = Range.Start * 2;
7965        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7966     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7967       Range.End = TmpVF;
7968       break;
7969     }
7970 
7971   return PredicateAtRangeStart;
7972 }
7973 
7974 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7975 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7976 /// of VF's starting at a given VF and extending it as much as possible. Each
7977 /// vectorization decision can potentially shorten this sub-range during
7978 /// buildVPlan().
7979 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7980                                            ElementCount MaxVF) {
7981   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7982   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7983     VFRange SubRange = {VF, MaxVFPlusOne};
7984     VPlans.push_back(buildVPlan(SubRange));
7985     VF = SubRange.End;
7986   }
7987 }
7988 
// Compute (and cache) the mask for the CFG edge Src->Dst: the source block's
// mask ANDed with the branch condition selecting Dst. A nullptr mask means
// all-ones.
VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
                                         VPlanPtr &Plan) {
  assert(is_contained(predecessors(Dst), Src) && "Invalid edge");

  // Look for cached value.
  std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
  EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
  if (ECEntryIt != EdgeMaskCache.end())
    return ECEntryIt->second;

  VPValue *SrcMask = createBlockInMask(Src, Plan);

  // The terminator has to be a branch inst!
  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
  assert(BI && "Unexpected terminator found");

  // An unconditional branch (or both successors equal) doesn't restrict the
  // mask beyond the source block's mask.
  if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
    return EdgeMaskCache[Edge] = SrcMask;

  // If source is an exiting block, we know the exit edge is dynamically dead
  // in the vector loop, and thus we don't need to restrict the mask.  Avoid
  // adding uses of an otherwise potentially dead instruction.
  if (OrigLoop->isLoopExiting(Src))
    return EdgeMaskCache[Edge] = SrcMask;

  VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
  assert(EdgeMask && "No Edge Mask found for condition");

  // Dst on the false successor: negate the condition.
  if (BI->getSuccessor(0) != Dst)
    EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());

  if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
    // The condition is 'SrcMask && EdgeMask', which is equivalent to
    // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
    // The select version does not introduce new UB if SrcMask is false and
    // EdgeMask is poison. Using 'and' here introduces undefined behavior.
    VPValue *False = Plan->getOrAddVPValue(
        ConstantInt::getFalse(BI->getCondition()->getType()));
    EdgeMask =
        Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
  }

  return EdgeMaskCache[Edge] = EdgeMask;
}
8033 
// Compute (and cache) the mask under which block BB executes in the vector
// loop: the OR of all incoming edge masks, or the tail-folding header mask
// for the loop header. A nullptr mask means all-ones.
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  if (OrigLoop->getHeader() == BB) {
    if (!CM.blockNeedsPredicationForAnyReason(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
    // constructing the desired canonical IV in the header block as its first
    // non-phi instructions.
    assert(CM.foldTailByMasking() && "must fold the tail");
    VPBasicBlock *HeaderVPBB =
        Plan->getVectorLoopRegion()->getEntryBasicBlock();
    auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
    auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
    HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());

    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
    // Prefer the target's active-lane-mask intrinsic when available.
    if (CM.TTI.emitGetActiveLaneMask()) {
      VPValue *TC = Plan->getOrCreateTripCount();
      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
    } else {
      VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
      BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    }
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
  }

  return BlockMaskCache[BB] = BlockMask;
}
8089 
// Try to build a widened-memory recipe for load/store I, clamping Range to
// the VFs for which the cost model decided to widen (or interleave) the
// access. Returns nullptr when the access will be scalarized instead.
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
                                                ArrayRef<VPValue *> Operands,
                                                VFRange &Range,
                                                VPlanPtr &Plan) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store");

  // Widen iff the cost model's per-VF decision is neither scalarization nor
  // profitable-to-scalarize.
  auto willWiden = [&](ElementCount VF) -> bool {
    if (VF.isScalar())
      return false;
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = createBlockInMask(I->getParent(), Plan);

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive.
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, Range.Start);
  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
  bool Consecutive =
      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

  // Loads: Operands[0] is the address. Stores: Operands[1] is the address,
  // Operands[0] the stored value.
  if (LoadInst *Load = dyn_cast<LoadInst>(I))
    return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
                                              Consecutive, Reverse);

  StoreInst *Store = cast<StoreInst>(I);
  return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
                                            Mask, Consecutive, Reverse);
}
8135 
// Create a VPWidenIntOrFpInductionRecipe for the induction phi \p Phi with
// start value \p Start and descriptor \p IndDesc. \p PhiOrTrunc is either the
// phi itself or a truncate of it, in which case the induction is widened in
// the truncated type.
static VPWidenIntOrFpInductionRecipe *
createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
                           VPValue *Start, const InductionDescriptor &IndDesc,
                           LoopVectorizationCostModel &CM, ScalarEvolution &SE,
                           Loop &OrigLoop, VFRange &Range) {
  // Returns true if an instruction \p I should be scalarized instead of
  // vectorized for the chosen vectorization factor.
  auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
    return CM.isScalarAfterVectorization(I, VF) ||
           CM.isProfitableToScalarize(I, VF);
  };

  // A scalar IV is needed when the IV (or its truncate) itself, or any of its
  // in-loop users, is scalarized. Note: this call and the one below both
  // clamp Range; their order matters.
  bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) {
        // Returns true if we should generate a scalar version of \p IV.
        if (ShouldScalarizeInstruction(PhiOrTrunc, VF))
          return true;
        auto isScalarInst = [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF);
        };
        return any_of(PhiOrTrunc->users(), isScalarInst);
      },
      Range);
  // Only a scalar IV (and no vector one) is needed when the IV itself is
  // scalarized across the (clamped) range.
  bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) {
        return ShouldScalarizeInstruction(PhiOrTrunc, VF);
      },
      Range);
  assert(IndDesc.getStartValue() ==
         Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
  assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
         "step must be loop invariant");
  // When widening a truncate of the phi, pass the truncate so the recipe is
  // generated in the narrower type.
  if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
    return new VPWidenIntOrFpInductionRecipe(
        Phi, Start, IndDesc, TruncI, NeedsScalarIV, !NeedsScalarIVOnly, SE);
  }
  assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
  return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV,
                                           !NeedsScalarIVOnly, SE);
}
8177 
8178 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8179     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const {
8180 
8181   // Check if this is an integer or fp induction. If so, build the recipe that
8182   // produces its scalar and vector values.
8183   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8184     return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM,
8185                                       *PSE.getSE(), *OrigLoop, Range);
8186 
8187   // Check if this is pointer induction. If so, build the recipe for it.
8188   if (auto *II = Legal->getPointerInductionDescriptor(Phi))
8189     return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II,
8190                                              *PSE.getSE());
8191   return nullptr;
8192 }
8193 
8194 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8195     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8196     VPlan &Plan) const {
8197   // Optimize the special case where the source is a constant integer
8198   // induction variable. Notice that we can only optimize the 'trunc' case
8199   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8200   // (c) other casts depend on pointer size.
8201 
8202   // Determine whether \p K is a truncation based on an induction variable that
8203   // can be optimized.
8204   auto isOptimizableIVTruncate =
8205       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8206     return [=](ElementCount VF) -> bool {
8207       return CM.isOptimizableIVTruncate(K, VF);
8208     };
8209   };
8210 
8211   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8212           isOptimizableIVTruncate(I), Range)) {
8213 
8214     auto *Phi = cast<PHINode>(I->getOperand(0));
8215     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8216     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8217     return createWidenInductionRecipe(Phi, I, Start, II, CM, *PSE.getSE(),
8218                                       *OrigLoop, Range);
8219   }
8220   return nullptr;
8221 }
8222 
8223 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8224                                                 ArrayRef<VPValue *> Operands,
8225                                                 VPlanPtr &Plan) {
8226   // If all incoming values are equal, the incoming VPValue can be used directly
8227   // instead of creating a new VPBlendRecipe.
8228   VPValue *FirstIncoming = Operands[0];
8229   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8230         return FirstIncoming == Inc;
8231       })) {
8232     return Operands[0];
8233   }
8234 
8235   unsigned NumIncoming = Phi->getNumIncomingValues();
8236   // For in-loop reductions, we do not need to create an additional select.
8237   VPValue *InLoopVal = nullptr;
8238   for (unsigned In = 0; In < NumIncoming; In++) {
8239     PHINode *PhiOp =
8240         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8241     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8242       assert(!InLoopVal && "Found more than one in-loop reduction!");
8243       InLoopVal = Operands[In];
8244     }
8245   }
8246 
8247   assert((!InLoopVal || NumIncoming == 2) &&
8248          "Found an in-loop reduction for PHI with unexpected number of "
8249          "incoming values");
8250   if (InLoopVal)
8251     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8252 
8253   // We know that all PHIs in non-header blocks are converted into selects, so
8254   // we don't have to worry about the insertion order and we can just use the
8255   // builder. At this point we generate the predication tree. There may be
8256   // duplications since this is a simple recursive scan, but future
8257   // optimizations will clean it up.
8258   SmallVector<VPValue *, 2> OperandsWithMask;
8259 
8260   for (unsigned In = 0; In < NumIncoming; In++) {
8261     VPValue *EdgeMask =
8262       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8263     assert((EdgeMask || NumIncoming == 1) &&
8264            "Multiple predecessors with one having a full mask");
8265     OperandsWithMask.push_back(Operands[In]);
8266     if (EdgeMask)
8267       OperandsWithMask.push_back(EdgeMask);
8268   }
8269   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8270 }
8271 
8272 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8273                                                    ArrayRef<VPValue *> Operands,
8274                                                    VFRange &Range) const {
8275 
8276   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8277       [this, CI](ElementCount VF) {
8278         return CM.isScalarWithPredication(CI, VF);
8279       },
8280       Range);
8281 
8282   if (IsPredicated)
8283     return nullptr;
8284 
8285   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8286   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8287              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8288              ID == Intrinsic::pseudoprobe ||
8289              ID == Intrinsic::experimental_noalias_scope_decl))
8290     return nullptr;
8291 
8292   auto willWiden = [&](ElementCount VF) -> bool {
8293     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8294     // The following case may be scalarized depending on the VF.
8295     // The flag shows whether we use Intrinsic or a usual Call for vectorized
8296     // version of the instruction.
8297     // Is it beneficial to perform intrinsic call compared to lib call?
8298     bool NeedToScalarize = false;
8299     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8300     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8301     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8302     return UseVectorIntrinsic || !NeedToScalarize;
8303   };
8304 
8305   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8306     return nullptr;
8307 
8308   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8309   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8310 }
8311 
8312 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8313   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8314          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8315   // Instruction should be widened, unless it is scalar after vectorization,
8316   // scalarization is profitable or it is predicated.
8317   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8318     return CM.isScalarAfterVectorization(I, VF) ||
8319            CM.isProfitableToScalarize(I, VF) ||
8320            CM.isScalarWithPredication(I, VF);
8321   };
8322   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8323                                                              Range);
8324 }
8325 
8326 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8327                                            ArrayRef<VPValue *> Operands) const {
8328   auto IsVectorizableOpcode = [](unsigned Opcode) {
8329     switch (Opcode) {
8330     case Instruction::Add:
8331     case Instruction::And:
8332     case Instruction::AShr:
8333     case Instruction::BitCast:
8334     case Instruction::FAdd:
8335     case Instruction::FCmp:
8336     case Instruction::FDiv:
8337     case Instruction::FMul:
8338     case Instruction::FNeg:
8339     case Instruction::FPExt:
8340     case Instruction::FPToSI:
8341     case Instruction::FPToUI:
8342     case Instruction::FPTrunc:
8343     case Instruction::FRem:
8344     case Instruction::FSub:
8345     case Instruction::ICmp:
8346     case Instruction::IntToPtr:
8347     case Instruction::LShr:
8348     case Instruction::Mul:
8349     case Instruction::Or:
8350     case Instruction::PtrToInt:
8351     case Instruction::SDiv:
8352     case Instruction::Select:
8353     case Instruction::SExt:
8354     case Instruction::Shl:
8355     case Instruction::SIToFP:
8356     case Instruction::SRem:
8357     case Instruction::Sub:
8358     case Instruction::Trunc:
8359     case Instruction::UDiv:
8360     case Instruction::UIToFP:
8361     case Instruction::URem:
8362     case Instruction::Xor:
8363     case Instruction::ZExt:
8364       return true;
8365     }
8366     return false;
8367   };
8368 
8369   if (!IsVectorizableOpcode(I->getOpcode()))
8370     return nullptr;
8371 
8372   // Success: widen this instruction.
8373   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8374 }
8375 
8376 void VPRecipeBuilder::fixHeaderPhis() {
8377   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8378   for (VPHeaderPHIRecipe *R : PhisToFix) {
8379     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8380     VPRecipeBase *IncR =
8381         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8382     R->addOperand(IncR->getVPSingleValue());
8383   }
8384 }
8385 
8386 VPBasicBlock *VPRecipeBuilder::handleReplication(
8387     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8388     VPlanPtr &Plan) {
8389   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8390       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8391       Range);
8392 
8393   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8394       [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
8395       Range);
8396 
8397   // Even if the instruction is not marked as uniform, there are certain
8398   // intrinsic calls that can be effectively treated as such, so we check for
8399   // them here. Conservatively, we only do this for scalable vectors, since
8400   // for fixed-width VFs we can always fall back on full scalarization.
8401   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8402     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8403     case Intrinsic::assume:
8404     case Intrinsic::lifetime_start:
8405     case Intrinsic::lifetime_end:
8406       // For scalable vectors if one of the operands is variant then we still
8407       // want to mark as uniform, which will generate one instruction for just
8408       // the first lane of the vector. We can't scalarize the call in the same
8409       // way as for fixed-width vectors because we don't know how many lanes
8410       // there are.
8411       //
8412       // The reasons for doing it this way for scalable vectors are:
8413       //   1. For the assume intrinsic generating the instruction for the first
8414       //      lane is still be better than not generating any at all. For
8415       //      example, the input may be a splat across all lanes.
8416       //   2. For the lifetime start/end intrinsics the pointer operand only
8417       //      does anything useful when the input comes from a stack object,
8418       //      which suggests it should always be uniform. For non-stack objects
8419       //      the effect is to poison the object, which still allows us to
8420       //      remove the call.
8421       IsUniform = true;
8422       break;
8423     default:
8424       break;
8425     }
8426   }
8427 
8428   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8429                                        IsUniform, IsPredicated);
8430   setRecipe(I, Recipe);
8431   Plan->addVPValue(I, Recipe);
8432 
8433   // Find if I uses a predicated instruction. If so, it will use its scalar
8434   // value. Avoid hoisting the insert-element which packs the scalar value into
8435   // a vector value, as that happens iff all users use the vector value.
8436   for (VPValue *Op : Recipe->operands()) {
8437     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8438     if (!PredR)
8439       continue;
8440     auto *RepR =
8441         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8442     assert(RepR->isPredicated() &&
8443            "expected Replicate recipe to be predicated");
8444     RepR->setAlsoPack(false);
8445   }
8446 
8447   // Finalize the recipe for Instr, first if it is not predicated.
8448   if (!IsPredicated) {
8449     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8450     VPBB->appendRecipe(Recipe);
8451     return VPBB;
8452   }
8453   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8454 
8455   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8456   assert(SingleSucc && "VPBB must have a single successor when handling "
8457                        "predicated replication.");
8458   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8459   // Record predicated instructions for above packing optimizations.
8460   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8461   VPBlockUtils::insertBlockAfter(Region, VPBB);
8462   auto *RegSucc = new VPBasicBlock();
8463   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8464   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8465   return RegSucc;
8466 }
8467 
VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  // The entry block branches on the mask; only lanes with an active mask
  // reach the predicated recipe.
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  // Void-typed instructions produce no value, so there is nothing to merge at
  // the region exit and no phi recipe is created.
  auto *PHIRecipe = Instr->getType()->isVoidTy()
                        ? nullptr
                        : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
  if (PHIRecipe) {
    // Re-map Instr to the merge phi, so later lookups of Instr's VPValue see
    // the merged result instead of the predicated recipe. Ordering matters:
    // remove the old mapping before adding the new one.
    Plan->removeVPValueFor(Instr);
    Plan->addVPValue(Instr, PHIRecipe);
  }
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  // NOTE(review): the trailing 'true' argument presumably marks the region as
  // a replicating region — confirm against VPRegionBlock's constructor.
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}
8500 
// Dispatch \p Instr to the appropriate recipe-construction helper. Returns a
// recipe, an existing VPValue to reuse, or null when the instruction should
// instead be handled via replication.
VPRecipeOrVPValueTy
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPlanPtr &Plan) {
  // First, check for specific widening recipes that deal with calls, memory
  // operations, inductions and Phi nodes.
  if (auto *CI = dyn_cast<CallInst>(Instr))
    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));

  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    // Phis outside the loop header are turned into blends (selects).
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Operands, Plan);
    // Header phi: first try to widen it as an induction.
    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
      return toVPRecipeResult(Recipe);

    VPHeaderPHIRecipe *PhiRecipe = nullptr;
    if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
      // For header phis, Operands[0] is the value incoming from the
      // preheader (set up by the caller).
      VPValue *StartV = Operands[0];
      if (Legal->isReductionVariable(Phi)) {
        const RecurrenceDescriptor &RdxDesc =
            Legal->getReductionVars().find(Phi)->second;
        assert(RdxDesc.getRecurrenceStartValue() ==
               Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
        PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
                                             CM.isInLoopReduction(Phi),
                                             CM.useOrderedReductions(RdxDesc));
      } else {
        PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
      }

      // Record the incoming value from the backedge, so we can add the incoming
      // value from the backedge after all recipes have been created.
      recordRecipeOf(cast<Instruction>(
          Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
      PhisToFix.push_back(PhiRecipe);
    } else {
      // TODO: record backedge value for remaining pointer induction phis.
      assert(Phi->getType()->isPointerTy() &&
             "only pointer phis should be handled here");
      assert(Legal->getInductionVars().count(Phi) &&
             "Not an induction variable");
      InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
      VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
      PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
    }

    return toVPRecipeResult(PhiRecipe);
  }

  // A truncate of an induction may itself be widened as an induction in the
  // truncated type.
  if (isa<TruncInst>(Instr) &&
      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
                                               Range, *Plan)))
    return toVPRecipeResult(Recipe);

  // If the cost model prefers scalarization for the whole range, bail out so
  // the caller falls back to replication.
  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    // Whether the select condition is loop-invariant is passed to the recipe.
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  // Fall back to generic element-wise widening for the remaining opcodes.
  return toVPRecipeResult(tryToWiden(Instr, Operands));
}
8575 
// Build VPlans covering the VF range [MinVF, MaxVF], one per sub-range of VFs
// that share the same widening decisions.
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattend. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
  // Note: Legal->getSinkAfter() is the same map as SinkAfter above; only the
  // mapped values (sink targets) are updated in place while iterating.
  for (auto &P : Legal->getSinkAfter()) {
    Instruction *SinkTarget = P.second;
    // FirstInst only backs the assert below, guarding against walking off the
    // front of the target's block.
    Instruction *FirstInst = &*SinkTarget->getParent()->begin();
    (void)FirstInst;
    // Walk backwards from the recorded target until a live instruction is
    // found.
    while (DeadInstructions.contains(SinkTarget)) {
      assert(
          SinkTarget != FirstInst &&
          "Must find a live instruction (at least the one feeding the "
          "first-order recurrence PHI) before reaching beginning of the block");
      SinkTarget = SinkTarget->getPrevNode();
      assert(SinkTarget != P.first &&
             "sink source equals target, no sinking required");
    }
    P.second = SinkTarget;
  }

  // Each buildVPlanWithVPRecipes call may clamp SubRange.End; the next
  // sub-range then starts where the previous (clamped) one ended.
  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}
8627 
8628 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
8629 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
8630 // BranchOnCount VPInstruction to the latch.
8631 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8632                                   bool HasNUW, bool IsVPlanNative) {
8633   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8634   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8635 
8636   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8637   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8638   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8639   if (IsVPlanNative)
8640     Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
8641   Header->insert(CanonicalIVPHI, Header->begin());
8642 
8643   auto *CanonicalIVIncrement =
8644       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8645                                : VPInstruction::CanonicalIVIncrement,
8646                         {CanonicalIVPHI}, DL);
8647   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8648 
8649   VPBasicBlock *EB = TopRegion->getExitBasicBlock();
8650   if (IsVPlanNative) {
8651     EB = cast<VPBasicBlock>(EB->getSinglePredecessor());
8652     EB->setCondBit(nullptr);
8653   }
8654   EB->appendRecipe(CanonicalIVIncrement);
8655 
8656   auto *BranchOnCount =
8657       new VPInstruction(VPInstruction::BranchOnCount,
8658                         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8659   EB->appendRecipe(BranchOnCount);
8660 }
8661 
8662 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8663     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8664     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8665 
8666   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8667 
8668   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8669 
8670   // ---------------------------------------------------------------------------
8671   // Pre-construction: record ingredients whose recipes we'll need to further
8672   // process after constructing the initial VPlan.
8673   // ---------------------------------------------------------------------------
8674 
8675   // Mark instructions we'll need to sink later and their targets as
8676   // ingredients whose recipe we'll need to record.
8677   for (auto &Entry : SinkAfter) {
8678     RecipeBuilder.recordRecipeOf(Entry.first);
8679     RecipeBuilder.recordRecipeOf(Entry.second);
8680   }
8681   for (auto &Reduction : CM.getInLoopReductionChains()) {
8682     PHINode *Phi = Reduction.first;
8683     RecurKind Kind =
8684         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8685     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8686 
8687     RecipeBuilder.recordRecipeOf(Phi);
8688     for (auto &R : ReductionOperations) {
8689       RecipeBuilder.recordRecipeOf(R);
8690       // For min/max reductions, where we have a pair of icmp/select, we also
8691       // need to record the ICmp recipe, so it can be removed later.
8692       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8693              "Only min/max recurrences allowed for inloop reductions");
8694       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8695         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8696     }
8697   }
8698 
8699   // For each interleave group which is relevant for this (possibly trimmed)
8700   // Range, add it to the set of groups to be later applied to the VPlan and add
8701   // placeholders for its members' Recipes which we'll be replacing with a
8702   // single VPInterleaveRecipe.
8703   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8704     auto applyIG = [IG, this](ElementCount VF) -> bool {
8705       return (VF.isVector() && // Query is illegal for VF == 1
8706               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8707                   LoopVectorizationCostModel::CM_Interleave);
8708     };
8709     if (!getDecisionAndClampRange(applyIG, Range))
8710       continue;
8711     InterleaveGroups.insert(IG);
8712     for (unsigned i = 0; i < IG->getFactor(); i++)
8713       if (Instruction *Member = IG->getMember(i))
8714         RecipeBuilder.recordRecipeOf(Member);
8715   };
8716 
8717   // ---------------------------------------------------------------------------
8718   // Build initial VPlan: Scan the body of the loop in a topological order to
8719   // visit each basic block after having visited its predecessor basic blocks.
8720   // ---------------------------------------------------------------------------
8721 
8722   // Create initial VPlan skeleton, with separate header and latch blocks.
8723   VPBasicBlock *HeaderVPBB = new VPBasicBlock();
8724   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8725   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8726   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8727   auto Plan = std::make_unique<VPlan>(TopRegion);
8728 
8729   Instruction *DLInst =
8730       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8731   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8732                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8733                         !CM.foldTailByMasking(), false);
8734 
8735   // Scan the body of the loop in a topological order to visit each basic block
8736   // after having visited its predecessor basic blocks.
8737   LoopBlocksDFS DFS(OrigLoop);
8738   DFS.perform(LI);
8739 
8740   VPBasicBlock *VPBB = HeaderVPBB;
8741   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8742   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8743     // Relevant instructions from basic block BB will be grouped into VPRecipe
8744     // ingredients and fill a new VPBasicBlock.
8745     unsigned VPBBsForBB = 0;
8746     VPBB->setName(BB->getName());
8747     Builder.setInsertPoint(VPBB);
8748 
8749     // Introduce each ingredient into VPlan.
8750     // TODO: Model and preserve debug instrinsics in VPlan.
8751     for (Instruction &I : BB->instructionsWithoutDebug()) {
8752       Instruction *Instr = &I;
8753 
8754       // First filter out irrelevant instructions, to ensure no recipes are
8755       // built for them.
8756       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8757         continue;
8758 
8759       SmallVector<VPValue *, 4> Operands;
8760       auto *Phi = dyn_cast<PHINode>(Instr);
8761       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8762         Operands.push_back(Plan->getOrAddVPValue(
8763             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8764       } else {
8765         auto OpRange = Plan->mapToVPValues(Instr->operands());
8766         Operands = {OpRange.begin(), OpRange.end()};
8767       }
8768       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8769               Instr, Operands, Range, Plan)) {
8770         // If Instr can be simplified to an existing VPValue, use it.
8771         if (RecipeOrValue.is<VPValue *>()) {
8772           auto *VPV = RecipeOrValue.get<VPValue *>();
8773           Plan->addVPValue(Instr, VPV);
8774           // If the re-used value is a recipe, register the recipe for the
8775           // instruction, in case the recipe for Instr needs to be recorded.
8776           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
8777             RecipeBuilder.setRecipe(Instr, R);
8778           continue;
8779         }
8780         // Otherwise, add the new recipe.
8781         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8782         for (auto *Def : Recipe->definedValues()) {
8783           auto *UV = Def->getUnderlyingValue();
8784           Plan->addVPValue(UV, Def);
8785         }
8786 
8787         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8788             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8789           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8790           // of the header block. That can happen for truncates of induction
8791           // variables. Those recipes are moved to the phi section of the header
8792           // block after applying SinkAfter, which relies on the original
8793           // position of the trunc.
8794           assert(isa<TruncInst>(Instr));
8795           InductionsToMove.push_back(
8796               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8797         }
8798         RecipeBuilder.setRecipe(Instr, Recipe);
8799         VPBB->appendRecipe(Recipe);
8800         continue;
8801       }
8802 
8803       // Otherwise, if all widening options failed, Instruction is to be
8804       // replicated. This may create a successor for VPBB.
8805       VPBasicBlock *NextVPBB =
8806           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8807       if (NextVPBB != VPBB) {
8808         VPBB = NextVPBB;
8809         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8810                                     : "");
8811       }
8812     }
8813 
8814     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8815     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8816   }
8817 
8818   // Fold the last, empty block into its predecessor.
8819   VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
8820   assert(VPBB && "expected to fold last (empty) block");
8821   // After here, VPBB should not be used.
8822   VPBB = nullptr;
8823 
8824   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8825          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8826          "entry block must be set to a VPRegionBlock having a non-empty entry "
8827          "VPBasicBlock");
8828   RecipeBuilder.fixHeaderPhis();
8829 
8830   // ---------------------------------------------------------------------------
8831   // Transform initial VPlan: Apply previously taken decisions, in order, to
8832   // bring the VPlan to its final state.
8833   // ---------------------------------------------------------------------------
8834 
8835   // Apply Sink-After legal constraints.
8836   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
8837     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
8838     if (Region && Region->isReplicator()) {
8839       assert(Region->getNumSuccessors() == 1 &&
8840              Region->getNumPredecessors() == 1 && "Expected SESE region!");
8841       assert(R->getParent()->size() == 1 &&
8842              "A recipe in an original replicator region must be the only "
8843              "recipe in its block");
8844       return Region;
8845     }
8846     return nullptr;
8847   };
8848   for (auto &Entry : SinkAfter) {
8849     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8850     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8851 
8852     auto *TargetRegion = GetReplicateRegion(Target);
8853     auto *SinkRegion = GetReplicateRegion(Sink);
8854     if (!SinkRegion) {
8855       // If the sink source is not a replicate region, sink the recipe directly.
8856       if (TargetRegion) {
8857         // The target is in a replication region, make sure to move Sink to
8858         // the block after it, not into the replication region itself.
8859         VPBasicBlock *NextBlock =
8860             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
8861         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
8862       } else
8863         Sink->moveAfter(Target);
8864       continue;
8865     }
8866 
8867     // The sink source is in a replicate region. Unhook the region from the CFG.
8868     auto *SinkPred = SinkRegion->getSinglePredecessor();
8869     auto *SinkSucc = SinkRegion->getSingleSuccessor();
8870     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
8871     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
8872     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
8873 
8874     if (TargetRegion) {
8875       // The target recipe is also in a replicate region, move the sink region
8876       // after the target region.
8877       auto *TargetSucc = TargetRegion->getSingleSuccessor();
8878       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
8879       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
8880       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
8881     } else {
8882       // The sink source is in a replicate region, we need to move the whole
8883       // replicate region, which should only contain a single recipe in the
8884       // main block.
8885       auto *SplitBlock =
8886           Target->getParent()->splitAt(std::next(Target->getIterator()));
8887 
8888       auto *SplitPred = SplitBlock->getSinglePredecessor();
8889 
8890       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
8891       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
8892       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
8893     }
8894   }
8895 
8896   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
8897   VPlanTransforms::removeRedundantInductionCasts(*Plan);
8898 
8899   // Now that sink-after is done, move induction recipes for optimized truncates
8900   // to the phi section of the header block.
8901   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
8902     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8903 
8904   // Adjust the recipes for any inloop reductions.
8905   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan,
8906                              RecipeBuilder, Range.Start);
8907 
8908   // Introduce a recipe to combine the incoming and previous values of a
8909   // first-order recurrence.
8910   for (VPRecipeBase &R :
8911        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8912     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
8913     if (!RecurPhi)
8914       continue;
8915 
8916     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
8917     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
8918     auto *Region = GetReplicateRegion(PrevRecipe);
8919     if (Region)
8920       InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
8921     if (Region || PrevRecipe->isPhi())
8922       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
8923     else
8924       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
8925 
8926     auto *RecurSplice = cast<VPInstruction>(
8927         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
8928                              {RecurPhi, RecurPhi->getBackedgeValue()}));
8929 
8930     RecurPhi->replaceAllUsesWith(RecurSplice);
8931     // Set the first operand of RecurSplice to RecurPhi again, after replacing
8932     // all users.
8933     RecurSplice->setOperand(0, RecurPhi);
8934   }
8935 
8936   // Interleave memory: for each Interleave Group we marked earlier as relevant
8937   // for this VPlan, replace the Recipes widening its memory instructions with a
8938   // single VPInterleaveRecipe at its insertion point.
8939   for (auto IG : InterleaveGroups) {
8940     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8941         RecipeBuilder.getRecipe(IG->getInsertPos()));
8942     SmallVector<VPValue *, 4> StoredValues;
8943     for (unsigned i = 0; i < IG->getFactor(); ++i)
8944       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8945         auto *StoreR =
8946             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
8947         StoredValues.push_back(StoreR->getStoredValue());
8948       }
8949 
8950     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8951                                         Recipe->getMask());
8952     VPIG->insertBefore(Recipe);
8953     unsigned J = 0;
8954     for (unsigned i = 0; i < IG->getFactor(); ++i)
8955       if (Instruction *Member = IG->getMember(i)) {
8956         if (!Member->getType()->isVoidTy()) {
8957           VPValue *OriginalV = Plan->getVPValue(Member);
8958           Plan->removeVPValueFor(Member);
8959           Plan->addVPValue(Member, VPIG->getVPValue(J));
8960           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8961           J++;
8962         }
8963         RecipeBuilder.getRecipe(Member)->eraseFromParent();
8964       }
8965   }
8966 
8967   // From this point onwards, VPlan-to-VPlan transformations may change the plan
8968   // in ways that accessing values using original IR values is incorrect.
8969   Plan->disableValue2VPValue();
8970 
8971   VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
8972   VPlanTransforms::sinkScalarOperands(*Plan);
8973   VPlanTransforms::mergeReplicateRegions(*Plan);
8974   VPlanTransforms::removeDeadRecipes(*Plan, *OrigLoop);
8975 
8976   std::string PlanName;
8977   raw_string_ostream RSO(PlanName);
8978   ElementCount VF = Range.Start;
8979   Plan->addVF(VF);
8980   RSO << "Initial VPlan for VF={" << VF;
8981   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
8982     Plan->addVF(VF);
8983     RSO << "," << VF;
8984   }
8985   RSO << "},UF>=1";
8986   RSO.flush();
8987   Plan->setName(PlanName);
8988 
8989   // Fold Exit block into its predecessor if possible.
8990   // TODO: Fold block earlier once all VPlan transforms properly maintain a
8991   // VPBasicBlock as exit.
8992   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
8993 
8994   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
8995   return Plan;
8996 }
8997 
8998 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
8999   // Outer loop handling: They may require CFG and instruction level
9000   // transformations before even evaluating whether vectorization is profitable.
9001   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9002   // the vectorization pipeline.
9003   assert(!OrigLoop->isInnermost());
9004   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9005 
9006   // Create new empty VPlan
9007   auto Plan = std::make_unique<VPlan>();
9008 
9009   // Build hierarchical CFG
9010   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9011   HCFGBuilder.buildHierarchicalCFG();
9012 
9013   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9014        VF *= 2)
9015     Plan->addVF(VF);
9016 
9017   if (EnableVPlanPredication) {
9018     VPlanPredicator VPP(*Plan);
9019     VPP.predicate();
9020 
9021     // Avoid running transformation to recipes until masked code generation in
9022     // VPlan-native path is in place.
9023     return Plan;
9024   }
9025 
9026   SmallPtrSet<Instruction *, 1> DeadInstructions;
9027   VPlanTransforms::VPInstructionsToVPRecipes(
9028       OrigLoop, Plan,
9029       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9030       DeadInstructions, *PSE.getSE());
9031 
9032   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9033                         true, true);
9034   return Plan;
9035 }
9036 
9037 // Adjust the recipes for reductions. For in-loop reductions the chain of
9038 // instructions leading from the loop exit instr to the phi need to be converted
9039 // to reductions, with one operand being vector and the other being the scalar
9040 // reduction chain. For other reductions, a select is introduced between the phi
9041 // and live-out recipes when folding the tail.
9042 void LoopVectorizationPlanner::adjustRecipesForReductions(
9043     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9044     ElementCount MinVF) {
9045   for (auto &Reduction : CM.getInLoopReductionChains()) {
9046     PHINode *Phi = Reduction.first;
9047     const RecurrenceDescriptor &RdxDesc =
9048         Legal->getReductionVars().find(Phi)->second;
9049     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9050 
9051     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9052       continue;
9053 
9054     // ReductionOperations are orders top-down from the phi's use to the
9055     // LoopExitValue. We keep a track of the previous item (the Chain) to tell
9056     // which of the two operands will remain scalar and which will be reduced.
9057     // For minmax the chain will be the select instructions.
9058     Instruction *Chain = Phi;
9059     for (Instruction *R : ReductionOperations) {
9060       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9061       RecurKind Kind = RdxDesc.getRecurrenceKind();
9062 
9063       VPValue *ChainOp = Plan->getVPValue(Chain);
9064       unsigned FirstOpId;
9065       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9066              "Only min/max recurrences allowed for inloop reductions");
9067       // Recognize a call to the llvm.fmuladd intrinsic.
9068       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9069       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9070              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9071       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9072         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9073                "Expected to replace a VPWidenSelectSC");
9074         FirstOpId = 1;
9075       } else {
9076         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9077                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9078                "Expected to replace a VPWidenSC");
9079         FirstOpId = 0;
9080       }
9081       unsigned VecOpId =
9082           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9083       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9084 
9085       auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
9086                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9087                          : nullptr;
9088 
9089       if (IsFMulAdd) {
9090         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9091         // need to create an fmul recipe to use as the vector operand for the
9092         // fadd reduction.
9093         VPInstruction *FMulRecipe = new VPInstruction(
9094             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9095         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9096         WidenRecipe->getParent()->insert(FMulRecipe,
9097                                          WidenRecipe->getIterator());
9098         VecOp = FMulRecipe;
9099       }
9100       VPReductionRecipe *RedRecipe =
9101           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9102       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9103       Plan->removeVPValueFor(R);
9104       Plan->addVPValue(R, RedRecipe);
9105       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9106       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9107       WidenRecipe->eraseFromParent();
9108 
9109       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9110         VPRecipeBase *CompareRecipe =
9111             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9112         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9113                "Expected to replace a VPWidenSC");
9114         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9115                "Expected no remaining users");
9116         CompareRecipe->eraseFromParent();
9117       }
9118       Chain = R;
9119     }
9120   }
9121 
9122   // If tail is folded by masking, introduce selects between the phi
9123   // and the live-out instruction of each reduction, at the beginning of the
9124   // dedicated latch block.
9125   if (CM.foldTailByMasking()) {
9126     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9127     for (VPRecipeBase &R :
9128          Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9129       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9130       if (!PhiR || PhiR->isInLoop())
9131         continue;
9132       VPValue *Cond =
9133           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9134       VPValue *Red = PhiR->getBackedgeValue();
9135       assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
9136              "reduction recipe must be defined before latch");
9137       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9138     }
9139   }
9140 }
9141 
9142 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9143 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9144                                VPSlotTracker &SlotTracker) const {
9145   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9146   IG->getInsertPos()->printAsOperand(O, false);
9147   O << ", ";
9148   getAddr()->printAsOperand(O, SlotTracker);
9149   VPValue *Mask = getMask();
9150   if (Mask) {
9151     O << ", ";
9152     Mask->printAsOperand(O, SlotTracker);
9153   }
9154 
9155   unsigned OpIdx = 0;
9156   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9157     if (!IG->getMember(i))
9158       continue;
9159     if (getNumStoreOperands() > 0) {
9160       O << "\n" << Indent << "  store ";
9161       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9162       O << " to index " << i;
9163     } else {
9164       O << "\n" << Indent << "  ";
9165       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9166       O << " = load from index " << i;
9167     }
9168     ++OpIdx;
9169   }
9170 }
9171 #endif
9172 
9173 void VPWidenCallRecipe::execute(VPTransformState &State) {
9174   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9175                                   *this, State);
9176 }
9177 
9178 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9179   auto &I = *cast<SelectInst>(getUnderlyingInstr());
9180   State.ILV->setDebugLocFromInst(&I);
9181 
9182   // The condition can be loop invariant  but still defined inside the
9183   // loop. This means that we can't just use the original 'cond' value.
9184   // We have to take the 'vectorized' value and pick the first lane.
9185   // Instcombine will make this a no-op.
9186   auto *InvarCond =
9187       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9188 
9189   for (unsigned Part = 0; Part < State.UF; ++Part) {
9190     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9191     Value *Op0 = State.get(getOperand(1), Part);
9192     Value *Op1 = State.get(getOperand(2), Part);
9193     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9194     State.set(this, Sel, Part);
9195     State.ILV->addMetadata(Sel, &I);
9196   }
9197 }
9198 
// Widen a simple instruction: for each unroll part, build one wide (or scalar,
// when VF == 1) clone of the underlying instruction from the part's widened
// operands. Handles unary/binary ops, compares, and casts; opcodes with
// dedicated recipes (call, br, phi, GEP, select) are unreachable here.
void VPWidenRecipe::execute(VPTransformState &State) {
  auto &I = *cast<Instruction>(getUnderlyingValue());
  auto &Builder = State.Builder;
  switch (I.getOpcode()) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    State.ILV->setDebugLocFromInst(&I);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // Gather this part's widened operands in operand order.
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      // CreateNAryOp may constant-fold, so V is not necessarily an
      // Instruction; flags only exist (and need adjusting) when it is one.
      if (auto *VecOp = dyn_cast<Instruction>(V)) {
        VecOp->copyIRFlags(&I);

        // If the instruction is vectorized and was in a basic block that needed
        // predication, we can't propagate poison-generating flags (nuw/nsw,
        // exact, etc.). The control flow has been linearized and the
        // instruction is no longer guarded by the predicate, which could make
        // the flag properties to no longer hold.
        if (State.MayGeneratePoisonRecipes.contains(this))
          VecOp->dropPoisonGeneratingFlags();
      }

      // Use this vector value for all users of the original instruction.
      State.set(this, V, Part);
      State.ILV->addMetadata(V, &I);
    }

    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    State.ILV->setDebugLocFromInst(Cmp);
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *B = State.get(getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      State.set(this, C, Part);
      State.ILV->addMetadata(C, &I);
    }

    break;
  }

  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = cast<CastInst>(&I);
    State.ILV->setDebugLocFromInst(CI);

    /// Vectorize casts. The destination type is widened to a vector of the
    /// scalar destination type unless VF is scalar.
    Type *DestTy = (State.VF.isScalar())
                       ? CI->getType()
                       : VectorType::get(CI->getType(), State.VF);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Value *A = State.get(getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      State.set(this, Cast, Part);
      State.ILV->addMetadata(Cast, &I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}
9316 
9317 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9318   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
9319   // Construct a vector GEP by widening the operands of the scalar GEP as
9320   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
9321   // results in a vector of pointers when at least one operand of the GEP
9322   // is vector-typed. Thus, to keep the representation compact, we only use
9323   // vector-typed operands for loop-varying values.
9324 
9325   if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
9326     // If we are vectorizing, but the GEP has only loop-invariant operands,
9327     // the GEP we build (by only using vector-typed operands for
9328     // loop-varying values) would be a scalar pointer. Thus, to ensure we
9329     // produce a vector of pointers, we need to either arbitrarily pick an
9330     // operand to broadcast, or broadcast a clone of the original GEP.
9331     // Here, we broadcast a clone of the original.
9332     //
9333     // TODO: If at some point we decide to scalarize instructions having
9334     //       loop-invariant operands, this special case will no longer be
9335     //       required. We would add the scalarization decision to
9336     //       collectLoopScalars() and teach getVectorValue() to broadcast
9337     //       the lane-zero scalar value.
9338     auto *Clone = State.Builder.Insert(GEP->clone());
9339     for (unsigned Part = 0; Part < State.UF; ++Part) {
9340       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
9341       State.set(this, EntryPart, Part);
9342       State.ILV->addMetadata(EntryPart, GEP);
9343     }
9344   } else {
9345     // If the GEP has at least one loop-varying operand, we are sure to
9346     // produce a vector of pointers. But if we are only unrolling, we want
9347     // to produce a scalar GEP for each unroll part. Thus, the GEP we
9348     // produce with the code below will be scalar (if VF == 1) or vector
9349     // (otherwise). Note that for the unroll-only case, we still maintain
9350     // values in the vector mapping with initVector, as we do for other
9351     // instructions.
9352     for (unsigned Part = 0; Part < State.UF; ++Part) {
9353       // The pointer operand of the new GEP. If it's loop-invariant, we
9354       // won't broadcast it.
9355       auto *Ptr = IsPtrLoopInvariant
9356                       ? State.get(getOperand(0), VPIteration(0, 0))
9357                       : State.get(getOperand(0), Part);
9358 
9359       // Collect all the indices for the new GEP. If any index is
9360       // loop-invariant, we won't broadcast it.
9361       SmallVector<Value *, 4> Indices;
9362       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9363         VPValue *Operand = getOperand(I);
9364         if (IsIndexLoopInvariant[I - 1])
9365           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9366         else
9367           Indices.push_back(State.get(Operand, Part));
9368       }
9369 
9370       // If the GEP instruction is vectorized and was in a basic block that
9371       // needed predication, we can't propagate the poison-generating 'inbounds'
9372       // flag. The control flow has been linearized and the GEP is no longer
9373       // guarded by the predicate, which could make the 'inbounds' properties to
9374       // no longer hold.
9375       bool IsInBounds =
9376           GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9377 
9378       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
9379       // but it should be a vector, otherwise.
9380       auto *NewGEP = IsInBounds
9381                          ? State.Builder.CreateInBoundsGEP(
9382                                GEP->getSourceElementType(), Ptr, Indices)
9383                          : State.Builder.CreateGEP(GEP->getSourceElementType(),
9384                                                    Ptr, Indices);
9385       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9386              "NewGEP is not a pointer vector");
9387       State.set(this, NewGEP, Part);
9388       State.ILV->addMetadata(NewGEP, GEP);
9389     }
9390   }
9391 }
9392 
// Widen an integer or floating-point induction: materialize the step in the
// vector preheader, build the stepped start value <Start, Start+Step, ...>,
// and create a vector phi that is advanced by RuntimeVF * Step once per unroll
// part. If the recipe wraps a truncate of the induction, start and step are
// first truncated to the narrower type.
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Int or FP induction being replicated.");

  Value *Start = getStartValue()->getLiveInIRValue();
  const InductionDescriptor &ID = getInductionDescriptor();
  TruncInst *Trunc = getTruncInst();
  IRBuilderBase &Builder = State.Builder;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
  assert(State.VF.isVector() && "must have vector VF");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  auto &DL = EntryVal->getModule()->getDataLayout();

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant
  auto CreateStepValue = [&](const SCEV *Step) -> Value * {
    // Expand the step SCEV in the vector preheader when possible; otherwise
    // the step must already be a plain IR value (SCEVUnknown).
    if (SE.isSCEVable(IV->getType())) {
      SCEVExpander Exp(SE, DL, "induction");
      return Exp.expandCodeFor(Step, Step->getType(),
                               State.CFG.VectorPreHeader->getTerminator());
    }
    return cast<SCEVUnknown>(Step)->getValue();
  };

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with creating the step value.
  Value *Step = CreateStepValue(ID.getStep());

  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    // Narrow both the step and the start to the truncated type.
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  // SteppedStart = splat(Start) op <0, 1, 2, ...> * Step.
  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
  Value *SteppedStart = getStepVector(
      SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(State.VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*State.CFG.PrevBB->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    State.set(this, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      State.ILV->addMetadata(LastInduction, EntryVal);

    // Each part's value is the previous one advanced by SplatVF.
    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  LastInduction->setName("vec.ind.next");
  VecInd->addIncoming(SteppedStart, State.CFG.VectorPreHeader);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  VecInd->addIncoming(LastInduction, State.CFG.VectorPreHeader);
}
9505 
void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
  assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
         "Not a pointer induction according to InductionDescriptor!");
  assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
         "Unexpected type.");

  // The canonical IV drives the scalar (per-lane) lowering path below.
  auto *IVR = getParent()->getPlan()->getCanonicalIV();
  PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));

  // If every user of this recipe only consumes scalar values, compute the
  // per-lane pointers directly from the canonical IV instead of creating a
  // dedicated pointer phi and vector GEPs.
  if (all_of(users(), [this](const VPUser *U) {
        return cast<VPRecipeBase>(U)->usesScalars(this);
      })) {
    // This is the normalized GEP that starts counting at zero.
    Value *PtrInd = State.Builder.CreateSExtOrTrunc(
        CanonicalIV, IndDesc.getStep()->getType());
    // Determine the number of scalars we need to generate for each unroll
    // iteration. If the instruction is uniform, we only need to generate the
    // first lane. Otherwise, we generate all VF values.
    bool IsUniform = vputils::onlyFirstLaneUsed(this);
    assert((IsUniform || !State.VF.isScalable()) &&
           "Cannot scalarize a scalable VF");
    unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // Index of lane 0 of this unroll part, relative to PtrInd.
      Value *PartStart =
          createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);

      for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
        Value *Idx = State.Builder.CreateAdd(
            PartStart, ConstantInt::get(PtrInd->getType(), Lane));
        Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);

        // Expand the step and apply Start + GlobalIdx * Step to produce the
        // pointer for this (Part, Lane).
        Value *Step = CreateStepValue(IndDesc.getStep(), SE,
                                      State.CFG.PrevBB->getTerminator());
        Value *SclrGep = emitTransformedIndex(
            State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
        SclrGep->setName("next.gep");
        State.set(this, SclrGep, VPIteration(Part, Lane));
      }
    }
    return;
  }

  // Vector path: the offsets below are built as splats/step-vectors, which
  // requires the induction step to be a SCEV constant.
  assert(isa<SCEVConstant>(IndDesc.getStep()) &&
         "Induction step not a SCEV constant!");
  Type *PhiType = IndDesc.getStep()->getType();

  // Build a pointer phi
  Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
  Type *ScStValueType = ScalarStartValue->getType();
  PHINode *NewPointerPhi =
      PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
  NewPointerPhi->addIncoming(ScalarStartValue, State.CFG.VectorPreHeader);

  // A pointer induction, performed by using a gep
  const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout();
  Instruction *InductionLoc = &*State.Builder.GetInsertPoint();

  const SCEV *ScalarStep = IndDesc.getStep();
  SCEVExpander Exp(SE, DL, "induction");
  Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
  // The pointer phi advances by VF * UF elements each vector iteration.
  Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
  Value *NumUnrolledElems =
      State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
  Value *InductionGEP = GetElementPtrInst::Create(
      IndDesc.getElementType(), NewPointerPhi,
      State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
      InductionLoc);
  // Add induction update using an incorrect block temporarily. The phi node
  // will be fixed after VPlan execution. Note that at this point the latch
  // block cannot be used, as it does not exist yet.
  // TODO: Model increment value in VPlan, by turning the recipe into a
  // multi-def and a subclass of VPHeaderPHIRecipe.
  NewPointerPhi->addIncoming(InductionGEP, State.CFG.VectorPreHeader);

  // Create UF many actual address geps that use the pointer
  // phi as base and a vectorized version of the step value
  // (<step*0, ..., step*N>) as offset.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Type *VecPhiType = VectorType::get(PhiType, State.VF);
    // Scalar offset of this part's lane 0: RuntimeVF * Part.
    Value *StartOffsetScalar =
        State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
    Value *StartOffset =
        State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
    // Create a vector of consecutive numbers from zero to VF.
    StartOffset = State.Builder.CreateAdd(
        StartOffset, State.Builder.CreateStepVector(VecPhiType));

    // GEP base is the phi; element offsets are (StartOffset * Step) per lane.
    Value *GEP = State.Builder.CreateGEP(
        IndDesc.getElementType(), NewPointerPhi,
        State.Builder.CreateMul(
            StartOffset,
            State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
            "vector.gep"));
    State.set(this, GEP, Part);
  }
}
9603 
void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "VPScalarIVStepsRecipe being replicated.");

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
  if (IndDesc.getInductionBinOp() &&
      isa<FPMathOperator>(IndDesc.getInductionBinOp()))
    State.Builder.setFastMathFlags(
        IndDesc.getInductionBinOp()->getFastMathFlags());

  Value *Step = State.get(getStepValue(), VPIteration(0, 0));
  // Derive the scalar IV for this recipe from the canonical IV, converting
  // type and applying start/step if this induction is not the canonical one.
  // Note: may also narrow Step (passed by reference) when truncating.
  auto CreateScalarIV = [&](Value *&Step) -> Value * {
    Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0));
    auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
    if (!isCanonical() || CanonicalIV->getType() != Ty) {
      // Convert the canonical IV to this induction's type, then apply
      // Start + IV * Step via emitTransformedIndex.
      ScalarIV =
          Ty->isIntegerTy()
              ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty)
              : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty);
      ScalarIV = emitTransformedIndex(State.Builder, ScalarIV,
                                      getStartValue()->getLiveInIRValue(), Step,
                                      IndDesc);
      ScalarIV->setName("offset.idx");
    }
    if (TruncToTy) {
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy);
      Step = State.Builder.CreateTrunc(Step, TruncToTy);
    }
    return ScalarIV;
  };

  Value *ScalarIV = CreateScalarIV(Step);
  if (State.VF.isVector()) {
    // Vector VF: materialize the individual scalar steps for all lanes/parts.
    buildScalarSteps(ScalarIV, Step, IndDesc, this, State);
    return;
  }

  // Scalar VF: emit one value per unroll part, ScalarIV + Part * VF * Step,
  // using FP or integer arithmetic to match the step type.
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
    Value *EntryPart;
    if (Step->getType()->isFloatingPointTy()) {
      Value *StartIdx =
          getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part);
      // Floating-point operations inherit FMF via the builder's flags.
      Value *MulOp = State.Builder.CreateFMul(StartIdx, Step);
      EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(),
                                            ScalarIV, MulOp);
    } else {
      Value *StartIdx =
          getRuntimeVF(State.Builder, Step->getType(), State.VF * Part);
      EntryPart = State.Builder.CreateAdd(
          ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction");
    }
    State.set(this, EntryPart, Part);
  }
}
9662 
// Delegate widening of the underlying phi to the InnerLoopVectorizer.
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
                                 State);
}
9667 
void VPBlendRecipe::execute(VPTransformState &State) {
  State.ILV->setDebugLocFromInst(Phi, &State.Builder);
  // We know that all PHIs in non-header blocks are converted into
  // selects, so we don't have to worry about the insertion order and we
  // can just use the builder.
  // At this point we generate the predication tree. There may be
  // duplications since this is a simple recursive scan, but future
  // optimizations will clean it up.

  unsigned NumIncoming = getNumIncomingValues();

  // Generate a sequence of selects of the form:
  // SELECT(Mask3, In3,
  //        SELECT(Mask2, In2,
  //               SELECT(Mask1, In1,
  //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi and
  // are essentially undef are taken from In0.
  InnerLoopVectorizer::VectorParts Entry(State.UF);
  for (unsigned In = 0; In < NumIncoming; ++In) {
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      // We might have single edge PHIs (blocks) - use an identity
      // 'select' for the first PHI operand.
      Value *In0 = State.get(getIncomingValue(In), Part);
      if (In == 0)
        Entry[Part] = In0; // Initialize with the first incoming value.
      else {
        // Select between the current value and the previous incoming edge
        // based on the incoming mask.
        Value *Cond = State.get(getMask(In), Part);
        Entry[Part] =
            State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
      }
    }
  }
  // Publish the final select chain for each unroll part.
  for (unsigned Part = 0; Part < State.UF; ++Part)
    State.set(this, Entry[Part], Part);
}
9706 
// Delegate code generation for the interleave group to the
// InnerLoopVectorizer, passing along address, stored values and mask.
void VPInterleaveRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
                                      getStoredValues(), getMask());
}
9712 
void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  Value *PrevInChain = State.get(getChainOp(), 0);
  RecurKind Kind = RdxDesc->getRecurrenceKind();
  bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      // Conditional reduction: replace masked-off lanes with the recurrence
      // identity so they do not affect the reduced value.
      Value *NewCond = State.get(Cond, Part);
      VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
      Value *Iden = RdxDesc->getRecurrenceIdentity(
          Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
      Value *IdenVec =
          State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      // Ordered (in-order FP) reduction: fold this part into the running
      // chain value; scalar VF degenerates to a plain binop.
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      // Unordered: reduce this part independently, then combine with the
      // chain value below.
      PrevInChain = State.get(getChainOp(), Part);
      NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      NextInChain =
          createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
                         NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
          PrevInChain);
    State.set(this, NextInChain, Part);
  }
}
9761 
void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
                                    IsPredicated, State);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from poison.
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform in which case generate only the first lane for each
  // of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}
9793 
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Branch on Mask works only on single instance.");

  unsigned Part = State.Instance->Part;
  unsigned Lane = State.Instance->Lane.getKnownLane();

  // Compute the scalar condition bit for this (Part, Lane): extract the lane
  // from a vector mask, or use true when no mask is present.
  Value *ConditionBit = nullptr;
  VPValue *BlockInMask = getMask();
  if (BlockInMask) {
    ConditionBit = State.get(BlockInMask, Part);
    if (ConditionBit->getType()->isVectorTy())
      ConditionBit = State.Builder.CreateExtractElement(
          ConditionBit, State.Builder.getInt32(Lane));
  } else // Block in mask is all-one.
    ConditionBit = State.Builder.getTrue();

  // Replace the temporary unreachable terminator with a new conditional branch,
  // whose two destinations will be set later when they are created.
  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
  assert(isa<UnreachableInst>(CurrentTerminator) &&
         "Expected to replace unreachable terminator with conditional branch.");
  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
  // Clear the placeholder successor; both targets are filled in later.
  CondBr->setSuccessor(0, nullptr);
  ReplaceInstWithInst(CurrentTerminator, CondBr);
}
9819 
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    // Vector case: phi between the vector before the insertelement (edge from
    // the predicating block) and the vector with the new element inserted.
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    // Scalar case: phi between poison (edge where the predicated instruction
    // did not execute) and the generated scalar value.
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}
9865 
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
  VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;

  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
  StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);

  assert((LI || SI) && "Invalid Load/Store instruction");
  assert((!SI || StoredValue) && "No stored value provided for widened store");
  assert((!LI || !StoredValue) && "Stored value provided for widened load");

  Type *ScalarDataTy = getLoadStoreType(&Ingredient);

  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
  const Align Alignment = getLoadStoreAlignment(&Ingredient);
  // Non-consecutive accesses are lowered as gathers/scatters.
  bool CreateGatherScatter = !Consecutive;

  auto &Builder = State.Builder;
  InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
  bool isMaskRequired = getMask();
  if (isMaskRequired)
    for (unsigned Part = 0; Part < State.UF; ++Part)
      BlockInMaskParts[Part] = State.get(getMask(), Part);

  // Compute the part-specific pointer for a consecutive access, bitcast to a
  // vector-of-ScalarDataTy pointer. For reversed accesses this also reverses
  // the mask as a side effect.
  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    // Propagate inbounds from the original address computation, if any.
    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
      InBounds = gep->isInBounds();
    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // RunTimeVF =  VScale * VF.getKnownMinValue()
      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
      PartPtr =
          cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        BlockInMaskParts[Part] =
            Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
    } else {
      // Forward access: advance the pointer by Part * VF elements.
      Value *Increment =
          createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
      PartPtr->setIsInBounds(InBounds);
    }

    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    State.ILV->setDebugLocFromInst(SI);

    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = State.get(StoredValue, Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
        Value *VectorGep = State.get(getAddr(), Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr =
            CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            BlockInMaskParts[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      State.ILV->addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  State.ILV->setDebugLocFromInst(LI);
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
      Value *VectorGep = State.get(getAddr(), Part);
      NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      State.ILV->addMetadata(NewLI, LI);
    } else {
      auto *VecPtr =
          CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(
            DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
            PoisonValue::get(DataTy), "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      State.ILV->addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
    }

    State.set(this, NewLI, Part);
  }
}
9992 
9993 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9994 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9995 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9996 // for predication.
9997 static ScalarEpilogueLowering getScalarEpilogueLowering(
9998     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9999     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10000     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
10001     LoopVectorizationLegality &LVL) {
10002   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10003   // don't look at hints or options, and don't request a scalar epilogue.
10004   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10005   // LoopAccessInfo (due to code dependency and not being able to reliably get
10006   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10007   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10008   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10009   // back to the old way and vectorize with versioning when forced. See D81345.)
10010   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10011                                                       PGSOQueryType::IRPass) &&
10012                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10013     return CM_ScalarEpilogueNotAllowedOptSize;
10014 
10015   // 2) If set, obey the directives
10016   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10017     switch (PreferPredicateOverEpilogue) {
10018     case PreferPredicateTy::ScalarEpilogue:
10019       return CM_ScalarEpilogueAllowed;
10020     case PreferPredicateTy::PredicateElseScalarEpilogue:
10021       return CM_ScalarEpilogueNotNeededUsePredicate;
10022     case PreferPredicateTy::PredicateOrDontVectorize:
10023       return CM_ScalarEpilogueNotAllowedUsePredicate;
10024     };
10025   }
10026 
10027   // 3) If set, obey the hints
10028   switch (Hints.getPredicate()) {
10029   case LoopVectorizeHints::FK_Enabled:
10030     return CM_ScalarEpilogueNotNeededUsePredicate;
10031   case LoopVectorizeHints::FK_Disabled:
10032     return CM_ScalarEpilogueAllowed;
10033   };
10034 
10035   // 4) if the TTI hook indicates this is profitable, request predication.
10036   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10037                                        LVL.getLAI()))
10038     return CM_ScalarEpilogueNotNeededUsePredicate;
10039 
10040   return CM_ScalarEpilogueAllowed;
10041 }
10042 
// Return the vector value for \p Def at unroll part \p Part, materializing it
// from scalar per-lane values (by broadcast or insertelement packing) if no
// vector value has been generated yet.
Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If Values have been set for this Def return the one relevant for \p Part.
  if (hasVectorValue(Def, Part))
    return Data.PerPartOutput[Def][Part];

  // No scalar value for lane 0 either: this is a live-in IR value; broadcast
  // it and cache the result.
  if (!hasScalarValue(Def, {Part, 0})) {
    Value *IRV = Def->getLiveInIRValue();
    Value *B = ILV->getBroadcastInstrs(IRV);
    set(Def, B, Part);
    return B;
  }

  Value *ScalarValue = get(Def, {Part, 0});
  // If we aren't vectorizing, we can just copy the scalar map values over
  // to the vector map.
  if (VF.isScalar()) {
    set(Def, ScalarValue, Part);
    return ScalarValue;
  }

  auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
  bool IsUniform = RepR && RepR->isUniform();

  // For a non-uniform value, all lanes [0, VF) must exist; probe the last one.
  unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
  // Check if there is a scalar value for the selected lane.
  if (!hasScalarValue(Def, {Part, LastLane})) {
    // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
    assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
            isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
           "unexpected recipe found to be invariant");
    IsUniform = true;
    LastLane = 0;
  }

  auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
  // Set the insert point after the last scalarized instruction or after the
  // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
  // will directly follow the scalar definitions.
  auto OldIP = Builder.saveIP();
  auto NewIP =
      isa<PHINode>(LastInst)
          ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
          : std::next(BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);

  // However, if we are vectorizing, we need to construct the vector values.
  // If the value is known to be uniform after vectorization, we can just
  // broadcast the scalar value corresponding to lane zero for each unroll
  // iteration. Otherwise, we construct the vector values using
  // insertelement instructions. Since the resulting vectors are stored in
  // State, we will only generate the insertelements once.
  Value *VectorValue = nullptr;
  if (IsUniform) {
    VectorValue = ILV->getBroadcastInstrs(ScalarValue);
    set(Def, VectorValue, Part);
  } else {
    // Initialize packing with insertelements to start from undef.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Undef, Part);
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
    VectorValue = get(Def, Part);
  }
  // Restore the previous insert point so subsequent codegen is unaffected.
  Builder.restoreIP(OldIP);
  return VectorValue;
}
10110 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR. Returns true if the outer loop was vectorized.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  // Outer-loop vectorization requires a computable trip count.
  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  CM.collectElementTypesForWidening();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);

  {
    // Scope the vectorizer and its runtime-check state so cleanup happens
    // right after code generation. Outer loops are currently generated with
    // an interleave (unroll) factor of 1.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                           &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}
10174 
10175 // Emit a remark if there are stores to floats that required a floating point
10176 // extension. If the vectorized loop was generated with floating point there
10177 // will be a performance penalty from the conversion overhead and the change in
10178 // the vector width.
10179 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10180   SmallVector<Instruction *, 4> Worklist;
10181   for (BasicBlock *BB : L->getBlocks()) {
10182     for (Instruction &Inst : *BB) {
10183       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10184         if (S->getValueOperand()->getType()->isFloatTy())
10185           Worklist.push_back(S);
10186       }
10187     }
10188   }
10189 
10190   // Traverse the floating point stores upwards searching, for floating point
10191   // conversions.
10192   SmallPtrSet<const Instruction *, 4> Visited;
10193   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10194   while (!Worklist.empty()) {
10195     auto *I = Worklist.pop_back_val();
10196     if (!L->contains(I))
10197       continue;
10198     if (!Visited.insert(I).second)
10199       continue;
10200 
10201     // Emit a remark if the floating point store required a floating
10202     // point conversion.
10203     // TODO: More work could be done to identify the root cause such as a
10204     // constant or a function return type and point the user to it.
10205     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10206       ORE->emit([&]() {
10207         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10208                                           I->getDebugLoc(), L->getHeader())
10209                << "floating point conversion changes vector width. "
10210                << "Mixed floating point precision requires an up/down "
10211                << "cast that will negatively impact performance.";
10212       });
10213 
10214     for (Use &Op : I->operands())
10215       if (auto *OpI = dyn_cast<Instruction>(Op))
10216         Worklist.push_back(OpI);
10217   }
10218 }
10219 
// Restrict interleaving/vectorization to explicitly-forced loops when either
// the pass options request it or the corresponding global enable flag
// (EnableLoopInterleaving / EnableLoopVectorization) is off.
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}
10225 
// Analyze loop \p L and, if it is legal and profitable, vectorize and/or
// interleave it. Returns true if the IR was changed (the loop was vectorized,
// interleaved, or had metadata attached to suppress further transformation).
bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
                    << L->getHeader()->getParent()->getName() << "' from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " interleave=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      // Forbid a scalar epilogue for low-trip-count loops unless vectorization
      // was explicitly forced.
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool AllowOrderedReductions;
  // If the flag is set, use that instead and override the TTI behaviour.
  if (ForceOrderedReductions.getNumOccurrences() > 0)
    AllowOrderedReductions = ForceOrderedReductions;
  else
    AllowOrderedReductions = TTI->enableOrderedReductions();
  if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  // Default to no vectorization (VF disabled) and no interleaving (IC = 1)
  // until the planner says otherwise.
  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleaving the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  // Set when no runtime safety checks were added below; used afterwards to
  // attach metadata disabling runtime unrolling of the remaining scalar loop.
  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out to not
    // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector codegeneration is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getPredicate());

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);

      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
        LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
                        DT);
        ++LoopsVectorized;

        // Re-simplify and re-form LCSSA on the remaining scalar loop before
        // vectorizing it as the epilogue.
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);

        VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);

        // Ensure that the start values for any VPReductionPHIRecipes are
        // updated before vectorising the epilogue loop.
        VPBasicBlock *Header =
            BestEpiPlan.getVectorLoopRegion()->getEntryBasicBlock();
        for (VPRecipeBase &R : Header->phis()) {
          if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
            if (auto *Resume = MainILV.getReductionResumeValue(
                    ReductionPhi->getRecurrenceDescriptor())) {
              VPValue *StartVal = new VPValue(Resume);
              BestEpiPlan.addExternalDef(StartVal);
              ReductionPhi->setOperand(0, StartVal);
            }
          }
        }

        LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                        DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that is
        // rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  // Transfer any followup metadata from the original loop ID to the remaining
  // scalar loop; otherwise mark the loop as vectorized (and possibly disable
  // runtime unrolling) so it is not transformed again.
  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}
10604 
10605 LoopVectorizeResult LoopVectorizePass::runImpl(
10606     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10607     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10608     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10609     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10610     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10611   SE = &SE_;
10612   LI = &LI_;
10613   TTI = &TTI_;
10614   DT = &DT_;
10615   BFI = &BFI_;
10616   TLI = TLI_;
10617   AA = &AA_;
10618   AC = &AC_;
10619   GetLAA = &GetLAA_;
10620   DB = &DB_;
10621   ORE = &ORE_;
10622   PSI = PSI_;
10623 
10624   // Don't attempt if
10625   // 1. the target claims to have no vector registers, and
10626   // 2. interleaving won't help ILP.
10627   //
10628   // The second condition is necessary because, even if the target has no
10629   // vector registers, loop vectorization may still enable scalar
10630   // interleaving.
10631   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10632       TTI->getMaxInterleaveFactor(1) < 2)
10633     return LoopVectorizeResult(false, false);
10634 
10635   bool Changed = false, CFGChanged = false;
10636 
10637   // The vectorizer requires loops to be in simplified form.
10638   // Since simplification may add new inner loops, it has to run before the
10639   // legality and profitability checks. This means running the loop vectorizer
10640   // will simplify all loops, regardless of whether anything end up being
10641   // vectorized.
10642   for (auto &L : *LI)
10643     Changed |= CFGChanged |=
10644         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10645 
10646   // Build up a worklist of inner-loops to vectorize. This is necessary as
10647   // the act of vectorizing or partially unrolling a loop creates new loops
10648   // and can invalidate iterators across the loops.
10649   SmallVector<Loop *, 8> Worklist;
10650 
10651   for (Loop *L : *LI)
10652     collectSupportedLoops(*L, LI, ORE, Worklist);
10653 
10654   LoopsAnalyzed += Worklist.size();
10655 
10656   // Now walk the identified inner loops.
10657   while (!Worklist.empty()) {
10658     Loop *L = Worklist.pop_back_val();
10659 
10660     // For the inner loops we actually process, form LCSSA to simplify the
10661     // transform.
10662     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10663 
10664     Changed |= CFGChanged |= processLoop(L);
10665   }
10666 
10667   // Process each loop nest in the function.
10668   return LoopVectorizeResult(Changed, CFGChanged);
10669 }
10670 
10671 PreservedAnalyses LoopVectorizePass::run(Function &F,
10672                                          FunctionAnalysisManager &AM) {
10673     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10674     auto &LI = AM.getResult<LoopAnalysis>(F);
10675     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10676     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10677     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10678     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10679     auto &AA = AM.getResult<AAManager>(F);
10680     auto &AC = AM.getResult<AssumptionAnalysis>(F);
10681     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10682     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10683 
10684     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10685     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10686         [&](Loop &L) -> const LoopAccessInfo & {
10687       LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
10688                                         TLI, TTI, nullptr, nullptr, nullptr};
10689       return LAM.getResult<LoopAccessAnalysis>(L, AR);
10690     };
10691     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10692     ProfileSummaryInfo *PSI =
10693         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10694     LoopVectorizeResult Result =
10695         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10696     if (!Result.MadeAnyChange)
10697       return PreservedAnalyses::all();
10698     PreservedAnalyses PA;
10699 
10700     // We currently do not preserve loopinfo/dominator analyses with outer loop
10701     // vectorization. Until this is addressed, mark these analyses as preserved
10702     // only for non-VPlan-native path.
10703     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10704     if (!EnableVPlanNativePath) {
10705       PA.preserve<LoopAnalysis>();
10706       PA.preserve<DominatorTreeAnalysis>();
10707     }
10708 
10709     if (Result.MadeCFGChange) {
10710       // Making CFG changes likely means a loop got vectorized. Indicate that
10711       // extra simplification passes should be run.
10712       // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
10713       // be run if runtime checks have been added.
10714       AM.getResult<ShouldRunExtraVectorPasses>(F);
10715       PA.preserve<ShouldRunExtraVectorPasses>();
10716     } else {
10717       PA.preserveSet<CFGAnalyses>();
10718     }
10719     return PA;
10720 }
10721 
10722 void LoopVectorizePass::printPipeline(
10723     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10724   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10725       OS, MapClassName2PassName);
10726 
10727   OS << "<";
10728   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10729   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10730   OS << ">";
10731 }
10732