1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
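//
// For illustration only (a simplified sketch, not the exact output of this
// pass), a scalar loop such as
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];
// is conceptually rewritten, for a vectorization factor of 4, into
//   for (int i = 0; i + 3 < n; i += 4)
//     a[i..i+3] = b[i..i+3] + c[i..i+3]; // one 'wide' SIMD iteration
// with any remaining iterations handled by a scalar epilogue loop.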
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is an ongoing development effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/LLVMContext.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/InstructionCost.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142 #include "llvm/Transforms/Utils/SizeOpts.h"
143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144 #include <algorithm>
145 #include <cassert>
146 #include <cstdint>
147 #include <cstdlib>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
202     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks with a "
204              "vectorize(enable) pragma."));
205 
206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
207 // and that predication is preferred; the enum below lists the choices. I.e., the
208 // vectorizer will try to fold the tail loop (epilogue) into the vector body
209 // and predicate the instructions accordingly. If tail-folding fails, the
210 // fallback strategy depends on these values:
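//
// Usage sketch (assuming the standard 'opt' driver; illustrative only):
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue ...
// asks the vectorizer to try tail-folding first and to create a scalar
// epilogue only if tail-folding fails.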
211 namespace PreferPredicateTy {
212   enum Option {
213     ScalarEpilogue = 0,
214     PredicateElseScalarEpilogue,
215     PredicateOrDontVectorize
216   };
217 } // namespace PreferPredicateTy
218 
219 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
220     "prefer-predicate-over-epilogue",
221     cl::init(PreferPredicateTy::ScalarEpilogue),
222     cl::Hidden,
223     cl::desc("Tail-folding and predication preferences over creating a scalar "
224              "epilogue loop."),
225     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
226                          "scalar-epilogue",
227                          "Don't tail-predicate loops, create scalar epilogue"),
228               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
229                          "predicate-else-scalar-epilogue",
230                          "prefer tail-folding, create scalar epilogue if tail "
231                          "folding fails."),
232               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
233                          "predicate-dont-vectorize",
234                          "prefer tail-folding, don't attempt vectorization if "
235                          "tail-folding fails.")));
236 
237 static cl::opt<bool> MaximizeBandwidth(
238     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
239     cl::desc("Maximize bandwidth when selecting vectorization factor which "
240              "will be determined by the smallest type in loop."));
241 
242 static cl::opt<bool> EnableInterleavedMemAccesses(
243     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
245 
246 /// An interleave-group may need masking if it resides in a block that needs
247 /// predication, or in order to mask away gaps.
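/// For example (illustrative): a group accessing A[3*i] and A[3*i+1] but not
/// A[3*i+2] has a gap, so a wide access covering all three lanes must mask
/// out the gap lane to avoid touching memory the original loop never accesses.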
248 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
249     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
250     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
251 
252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
253     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
254     cl::desc("We don't interleave loops with an estimated constant trip count "
255              "below this number"));
256 
257 static cl::opt<unsigned> ForceTargetNumScalarRegs(
258     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
259     cl::desc("A flag that overrides the target's number of scalar registers."));
260 
261 static cl::opt<unsigned> ForceTargetNumVectorRegs(
262     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
263     cl::desc("A flag that overrides the target's number of vector registers."));
264 
265 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
266     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
267     cl::desc("A flag that overrides the target's max interleave factor for "
268              "scalar loops."));
269 
270 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
271     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
272     cl::desc("A flag that overrides the target's max interleave factor for "
273              "vectorized loops."));
274 
275 static cl::opt<unsigned> ForceTargetInstructionCost(
276     "force-target-instruction-cost", cl::init(0), cl::Hidden,
277     cl::desc("A flag that overrides the target's expected cost for "
278              "an instruction to a single constant value. Mostly "
279              "useful for getting consistent testing."));
280 
281 static cl::opt<bool> ForceTargetSupportsScalableVectors(
282     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
283     cl::desc(
284         "Pretend that scalable vectors are supported, even if the target does "
285         "not support them. This flag should only be used for testing."));
286 
287 static cl::opt<unsigned> SmallLoopCost(
288     "small-loop-cost", cl::init(20), cl::Hidden,
289     cl::desc(
290         "The cost of a loop that is considered 'small' by the interleaver."));
291 
292 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
293     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
294     cl::desc("Enable the use of the block frequency analysis to access PGO "
295              "heuristics minimizing code growth in cold regions and being more "
296              "aggressive in hot regions."));
297 
298 // Runtime interleave loops for load/store throughput.
299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
300     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
301     cl::desc(
302         "Enable runtime interleaving until load/store ports are saturated"));
303 
304 /// Interleave small loops with scalar reductions.
305 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
306     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
307     cl::desc("Enable interleaving for loops with small iteration counts that "
308              "contain scalar reductions to expose ILP."));
309 
310 /// The number of stores in a loop that are allowed to need predication.
311 static cl::opt<unsigned> NumberOfStoresToPredicate(
312     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
313     cl::desc("Max number of stores to be predicated behind an if."));
314 
315 static cl::opt<bool> EnableIndVarRegisterHeur(
316     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
317     cl::desc("Count the induction variable only once when interleaving"));
318 
319 static cl::opt<bool> EnableCondStoresVectorization(
320     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
321     cl::desc("Enable if predication of stores during vectorization."));
322 
323 static cl::opt<unsigned> MaxNestedScalarReductionIC(
324     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
325     cl::desc("The maximum interleave count to use when interleaving a scalar "
326              "reduction in a nested loop."));
327 
328 static cl::opt<bool>
329     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
330                            cl::Hidden,
331                            cl::desc("Prefer in-loop vector reductions, "
332                                     "overriding the target's preference."));
333 
334 static cl::opt<bool> ForceOrderedReductions(
335     "force-ordered-reductions", cl::init(false), cl::Hidden,
336     cl::desc("Enable the vectorization of loops with in-order (strict) "
337              "FP reductions"));
338 
339 static cl::opt<bool> PreferPredicatedReductionSelect(
340     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
341     cl::desc(
342         "Prefer predicating a reduction operation over an after loop select."));
343 
344 cl::opt<bool> EnableVPlanNativePath(
345     "enable-vplan-native-path", cl::init(false), cl::Hidden,
346     cl::desc("Enable VPlan-native vectorization path with "
347              "support for outer loop vectorization."));
348 
349 // FIXME: Remove this switch once we have divergence analysis. Currently we
350 // assume divergent non-backedge branches when this switch is true.
351 cl::opt<bool> EnableVPlanPredication(
352     "enable-vplan-predication", cl::init(false), cl::Hidden,
353     cl::desc("Enable VPlan-native vectorization path predicator with "
354              "support for outer loop vectorization."));
355 
356 // This flag enables the stress testing of the VPlan H-CFG construction in the
357 // VPlan-native vectorization path. It must be used in conjunction with
358 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
359 // verification of the H-CFGs built.
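//
// Illustrative invocation (assuming the standard 'opt' driver):
//   opt -passes=loop-vectorize -enable-vplan-native-path \
//       -vplan-build-stress-test -vplan-verify-hcfg ...
// builds (and verifies) a VPlan H-CFG for every supported loop nest and bails
// out before any vector code is generated.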
360 static cl::opt<bool> VPlanBuildStressTest(
361     "vplan-build-stress-test", cl::init(false), cl::Hidden,
362     cl::desc(
363         "Build VPlan for every supported loop nest in the function and bail "
364         "out right after the build (stress test the VPlan H-CFG construction "
365         "in the VPlan-native vectorization path)."));
366 
367 cl::opt<bool> llvm::EnableLoopInterleaving(
368     "interleave-loops", cl::init(true), cl::Hidden,
369     cl::desc("Enable loop interleaving in Loop vectorization passes"));
370 cl::opt<bool> llvm::EnableLoopVectorization(
371     "vectorize-loops", cl::init(true), cl::Hidden,
372     cl::desc("Run the Loop vectorization passes"));
373 
374 cl::opt<bool> PrintVPlansInDotFormat(
375     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
376     cl::desc("Use dot format instead of plain text when dumping VPlans"));
377 
378 /// A helper function that returns true if the given type is irregular. The
379 /// type is irregular if its allocated size doesn't equal the store size of an
380 /// element of the corresponding vector type.
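/// For example (illustrative): i1 has a type size of 1 bit but an alloc size
/// of 8 bits, so it is irregular; i32 has both sizes equal to 32 bits, so it
/// is not.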
381 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
382   // Determine if an array of N elements of type Ty is "bitcast compatible"
383   // with a <N x Ty> vector.
384   // This is only true if there is no padding between the array elements.
385   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
386 }
387 
388 /// A helper function that returns the reciprocal of the block probability of
389 /// predicated blocks. If we return X, we are assuming the predicated block
390 /// will execute once for every X iterations of the loop header.
391 ///
392 /// TODO: We should use actual block probability here, if available. Currently,
393 ///       we always assume predicated blocks have a 50% chance of executing.
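/// For example (illustrative): with the current assumption, the cost model
/// divides the cost of a predicated block by 2, i.e. it charges the block as
/// if it executed on every other iteration of the loop header.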
394 static unsigned getReciprocalPredBlockProb() { return 2; }
395 
396 /// A helper function that returns an integer or floating-point constant with
397 /// value C.
398 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
399   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
400                            : ConstantFP::get(Ty, C);
401 }
402 
403 /// Returns "best known" trip count for the specified loop \p L as defined by
404 /// the following procedure:
405 ///   1) Returns exact trip count if it is known.
406 ///   2) Returns expected trip count according to profile data if any.
407 ///   3) Returns upper bound estimate if it is known.
408 ///   4) Returns None if all of the above failed.
409 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
410   // Check if exact trip count is known.
411   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
412     return ExpectedTC;
413 
414   // Check if there is an expected trip count available from profile data.
415   if (LoopVectorizeWithBlockFrequency)
416     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
417       return EstimatedTC;
418 
419   // Check if upper bound estimate is known.
420   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
421     return ExpectedTC;
422 
423   return None;
424 }
425 
426 // Forward declare GeneratedRTChecks.
427 class GeneratedRTChecks;
428 
429 namespace llvm {
430 
431 AnalysisKey ShouldRunExtraVectorPasses::Key;
432 
433 /// InnerLoopVectorizer vectorizes loops which contain only one basic
434 /// block to a specified vectorization factor (VF).
435 /// This class performs the widening of scalars into vectors, or multiple
436 /// scalars. This class also implements the following features:
437 /// * It inserts an epilogue loop for handling loops that don't have iteration
438 ///   counts that are known to be a multiple of the vectorization factor.
439 /// * It handles the code generation for reduction variables.
440 /// * Scalarization (implementation using scalars) of un-vectorizable
441 ///   instructions.
442 /// InnerLoopVectorizer does not perform any vectorization-legality
443 /// checks, and relies on the caller to check for the different legality
444 /// aspects. The InnerLoopVectorizer relies on the
445 /// LoopVectorizationLegality class to provide information about the induction
446 /// and reduction variables that were found in the loop.
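///
/// The skeleton built around the two loops is roughly the following
/// (simplified sketch; the exact shape depends on which runtime checks are
/// needed):
///   - entry: minimum-iteration-count check, then optional SCEV and memory
///     runtime checks; any failing check bypasses to the scalar preheader.
///   - vector preheader -> vector loop body -> middle block; the middle block
///     branches to the exit block if no iterations remain, or to the scalar
///     preheader so the scalar (epilogue) loop handles the remainder.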
447 class InnerLoopVectorizer {
448 public:
449   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
450                       LoopInfo *LI, DominatorTree *DT,
451                       const TargetLibraryInfo *TLI,
452                       const TargetTransformInfo *TTI, AssumptionCache *AC,
453                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
454                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
455                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
456                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
457       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
458         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
459         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
460         PSI(PSI), RTChecks(RTChecks) {
461     // Query this against the original loop and save it here because the profile
462     // of the original loop header may change as the transformation happens.
463     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
464         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
465   }
466 
467   virtual ~InnerLoopVectorizer() = default;
468 
469   /// Create a new empty loop that will contain vectorized instructions later
470   /// on, while the old loop will be used as the scalar remainder. Control flow
471   /// is generated around the vectorized (and scalar epilogue) loops consisting
472   /// of various checks and bypasses. Return the pre-header block of the new
473   /// loop.
474   /// In the case of epilogue vectorization, this function is overridden to
475   /// handle the more complex control flow around the loops.
476   virtual BasicBlock *createVectorizedLoopSkeleton();
477 
478   /// Widen a single call instruction within the innermost loop.
479   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
480                             VPTransformState &State);
481 
482   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
483   void fixVectorizedLoop(VPTransformState &State);
484 
485   // Return true if any runtime check is added.
486   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
487 
488   /// A type for vectorized values in the new loop. Each value from the
489   /// original loop, when vectorized, is represented by UF vector values in the
490   /// new unrolled loop, where UF is the unroll factor.
491   using VectorParts = SmallVector<Value *, 2>;
492 
493   /// Vectorize a single first-order recurrence or pointer induction PHINode in
494   /// a block. This method handles the induction variable canonicalization. It
495   /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
496   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
497                            VPTransformState &State);
498 
499   /// A helper function to scalarize a single Instruction in the innermost loop.
500   /// Generates a single scalar instance of \p Instr for the lane and part
501   /// given by \p Instance.
502   /// Uses the VPValue operands from \p RepRecipe instead of \p
503   /// Instr's operands.
504   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
505                             const VPIteration &Instance, bool IfPredicateInstr,
506                             VPTransformState &State);
507 
508   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
509   /// is provided, the integer induction variable will first be truncated to
510   /// the corresponding type.
511   void widenIntOrFpInduction(PHINode *IV, const InductionDescriptor &ID,
512                              Value *Start, TruncInst *Trunc, VPValue *Def,
513                              VPTransformState &State);
514 
515   /// Construct the vector value of a scalarized value \p V one lane at a time.
516   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
517                                  VPTransformState &State);
518 
519   /// Try to vectorize interleaved access group \p Group with the base address
520   /// given in \p Addr, optionally masking the vector operations if \p
521   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
522   /// values in the vectorized loop.
523   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
524                                 ArrayRef<VPValue *> VPDefs,
525                                 VPTransformState &State, VPValue *Addr,
526                                 ArrayRef<VPValue *> StoredValues,
527                                 VPValue *BlockInMask = nullptr);
528 
529   /// Set the debug location in the builder using the debug location in \p V.
530   /// If \p CustomBuilder is None, the class member's Builder is used.
531   void setDebugLocFromInst(const Value *V,
532                            Optional<IRBuilder<> *> CustomBuilder = None);
533 
534   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
535   void fixNonInductionPHIs(VPTransformState &State);
536 
537   /// Returns true if the reordering of FP operations is not allowed, but we are
538   /// able to vectorize with strict in-order reductions for the given RdxDesc.
539   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
540 
541   /// Create a broadcast instruction. This method generates a broadcast
542   /// instruction (shuffle) for loop invariant values and for the induction
543   /// value. If this is the induction variable then we extend it to N, N+1, ...
544   /// this is needed because each iteration in the loop corresponds to a SIMD
545   /// element.
546   virtual Value *getBroadcastInstrs(Value *V);
547 
548   /// Add metadata from one instruction to another.
549   ///
550   /// This includes both the original MDs from \p From and additional ones (\see
551   /// addNewMetadata).  Use this for *newly created* instructions in the vector
552   /// loop.
553   void addMetadata(Instruction *To, Instruction *From);
554 
555   /// Similar to the previous function but it adds the metadata to a
556   /// vector of instructions.
557   void addMetadata(ArrayRef<Value *> To, Instruction *From);
558 
559 protected:
560   friend class LoopVectorizationPlanner;
561 
562   /// A small list of PHINodes.
563   using PhiVector = SmallVector<PHINode *, 4>;
564 
565   /// A type for scalarized values in the new loop. Each value from the
566   /// original loop, when scalarized, is represented by UF x VF scalar values
567   /// in the new unrolled loop, where UF is the unroll factor and VF is the
568   /// vectorization factor.
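  /// For example (illustrative): with UF = 2 and a fixed VF of 4, a scalarized
  /// value is held as 2 parts of 4 scalar values each.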
569   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
570 
571   /// Set up the values of the IVs correctly when exiting the vector loop.
572   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
573                     Value *CountRoundDown, Value *EndValue,
574                     BasicBlock *MiddleBlock);
575 
576   /// Create a new induction variable inside L.
577   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
578                                    Value *Step, Instruction *DL);
579 
580   /// Handle all cross-iteration phis in the header.
581   void fixCrossIterationPHIs(VPTransformState &State);
582 
583   /// Create the exit value of first order recurrences in the middle block and
584   /// update their users.
585   void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
586 
587   /// Create code for the loop exit value of the reduction.
588   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
589 
590   /// Clear NSW/NUW flags from reduction instructions if necessary.
591   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
592                                VPTransformState &State);
593 
594   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
595   /// means we need to add the appropriate incoming value from the middle
596   /// block as exiting edges from the scalar epilogue loop (if present) are
597   /// already in place, and we exit the vector loop exclusively to the middle
598   /// block.
599   void fixLCSSAPHIs(VPTransformState &State);
600 
601   /// Iteratively sink the scalarized operands of a predicated instruction into
602   /// the block that was created for it.
603   void sinkScalarOperands(Instruction *PredInst);
604 
605   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
606   /// represented as.
607   void truncateToMinimalBitwidths(VPTransformState &State);
608 
609   /// This function adds
610   /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
611   /// to each vector element of \p Val. The sequence starts at \p StartIdx.
612   /// \p Opcode is relevant for FP induction variables.
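  /// For example (illustrative): with StartIdx = 0 and Step = 2, the vector
  /// <0, 2, 4, 6> is added to a <4 x i32> \p Val; for FP induction variables
  /// the combination uses \p Opcode (e.g. FAdd) instead of an integer add.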
613   virtual Value *
614   getStepVector(Value *Val, Value *StartIdx, Value *Step,
615                 Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);
616 
617   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
618   /// variable on which to base the steps, \p Step is the size of the step, and
619   /// \p EntryVal is the value from the original loop that maps to the steps.
620   /// Note that \p EntryVal doesn't have to be an induction variable - it
621   /// can also be a truncate instruction.
622   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
623                         const InductionDescriptor &ID, VPValue *Def,
624                         VPTransformState &State);
625 
626   /// Create a vector induction phi node based on an existing scalar one. \p
627   /// EntryVal is the value from the original loop that maps to the vector phi
628   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
629   /// truncate instruction, instead of widening the original IV, we widen a
630   /// version of the IV truncated to \p EntryVal's type.
631   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
632                                        Value *Step, Value *Start,
633                                        Instruction *EntryVal, VPValue *Def,
634                                        VPTransformState &State);
635 
636   /// Returns true if an instruction \p I should be scalarized instead of
637   /// vectorized for the chosen vectorization factor.
638   bool shouldScalarizeInstruction(Instruction *I) const;
639 
640   /// Returns true if we should generate a scalar version of \p IV.
641   bool needsScalarInduction(Instruction *IV) const;
642 
643   /// Generate a shuffle sequence that will reverse the vector Vec.
644   virtual Value *reverseVector(Value *Vec);
645 
646   /// Returns (and creates if needed) the original loop trip count.
647   Value *getOrCreateTripCount(Loop *NewLoop);
648 
649   /// Returns (and creates if needed) the trip count of the widened loop.
650   Value *getOrCreateVectorTripCount(Loop *NewLoop);
651 
652   /// Returns a bitcasted value to the requested vector type.
653   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
654   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
655                                 const DataLayout &DL);
656 
657   /// Emit a bypass check to see if the vector trip count is zero, including if
658   /// it overflows.
659   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
660 
661   /// Emit a bypass check to see if all of the SCEV assumptions we've
662   /// had to make are correct. Returns the block containing the checks or
663   /// nullptr if no checks have been added.
664   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
665 
666   /// Emit bypass checks to check any memory assumptions we may have made.
667   /// Returns the block containing the checks or nullptr if no checks have been
668   /// added.
669   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
670 
671   /// Compute the transformed value of Index at offset StartValue using step
672   /// StepValue.
673   /// For integer induction, returns StartValue + Index * StepValue.
674   /// For pointer induction, returns StartValue[Index * StepValue].
675   /// FIXME: The newly created binary instructions should contain nsw/nuw
676   /// flags, which can be found from the original scalar operations.
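  /// For example (illustrative): with StartValue = 100, StepValue = 3 and
  /// Index = 4, an integer induction yields 100 + 4 * 3 = 112, while a
  /// pointer induction yields the address of StartValue[4 * 3].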
677   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
678                               const DataLayout &DL,
679                               const InductionDescriptor &ID) const;
680 
681   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
682   /// vector loop preheader, middle block and scalar preheader. Also
683   /// allocate a loop object for the new vector loop and return it.
684   Loop *createVectorLoopSkeleton(StringRef Prefix);
685 
686   /// Create new phi nodes for the induction variables to resume iteration count
687   /// in the scalar epilogue, from where the vectorized loop left off (given by
688   /// \p VectorTripCount).
689   /// In cases where the loop skeleton is more complicated (e.g. epilogue
690   /// vectorization) and the resume values can come from an additional bypass
691   /// block, the \p AdditionalBypass pair provides information about the bypass
692   /// block and the end value on the edge from bypass to this loop.
693   void createInductionResumeValues(
694       Loop *L, Value *VectorTripCount,
695       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
696 
697   /// Complete the loop skeleton by adding debug MDs, creating appropriate
698   /// conditional branches in the middle block, preparing the builder and
699   /// running the verifier. Take in the vector loop \p L as argument, and return
700   /// the preheader of the completed vector loop.
701   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
702 
703   /// Add additional metadata to \p To that was not present on \p Orig.
704   ///
705   /// Currently this is used to add the noalias annotations based on the
706   /// inserted memchecks.  Use this for instructions that are *cloned* into the
707   /// vector loop.
708   void addNewMetadata(Instruction *To, const Instruction *Orig);
709 
710   /// Collect poison-generating recipes that may generate a poison value that is
711   /// used after vectorization, even when their operands are not poison. Those
712   /// recipes meet the following conditions:
713   ///  * Contribute to the address computation of a recipe generating a widen
714   ///    memory load/store (VPWidenMemoryInstructionRecipe or
715   ///    VPInterleaveRecipe).
716   ///  * Such a widen memory load/store has at least one underlying Instruction
717   ///    that is in a basic block that needs predication and after vectorization
718   ///    the generated instruction won't be predicated.
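  /// For example (illustrative): an 'inbounds' GEP that computes the address
  /// of a load in a conditional block is collected here if the load becomes an
  /// unconditional wide load after vectorization; its poison-generating flag
  /// must then be dropped, because the address is now computed for lanes the
  /// original loop guarded against.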
719   void collectPoisonGeneratingRecipes(VPTransformState &State);
720 
721   /// Allow subclasses to override and print debug traces before/after vplan
722   /// execution, when trace information is requested.
723   virtual void printDebugTracesAtStart() {}
724   virtual void printDebugTracesAtEnd() {}
725 
726   /// The original loop.
727   Loop *OrigLoop;
728 
729   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
730   /// dynamic knowledge to simplify SCEV expressions and converts them to a
731   /// more usable form.
732   PredicatedScalarEvolution &PSE;
733 
734   /// Loop Info.
735   LoopInfo *LI;
736 
737   /// Dominator Tree.
738   DominatorTree *DT;
739 
740   /// Alias Analysis.
741   AAResults *AA;
742 
743   /// Target Library Info.
744   const TargetLibraryInfo *TLI;
745 
746   /// Target Transform Info.
747   const TargetTransformInfo *TTI;
748 
749   /// Assumption Cache.
750   AssumptionCache *AC;
751 
752   /// Interface to emit optimization remarks.
753   OptimizationRemarkEmitter *ORE;
754 
755   /// LoopVersioning.  It's only set up (non-null) if memchecks were
756   /// used.
757   ///
758   /// This is currently only used to add no-alias metadata based on the
759   /// memchecks.  The actual versioning is performed manually.
760   std::unique_ptr<LoopVersioning> LVer;
761 
762   /// The vectorization SIMD factor to use. Each vector will have this many
763   /// vector elements.
764   ElementCount VF;
765 
766   /// The vectorization unroll factor to use. Each scalar is vectorized to this
767   /// many different vector instructions.
768   unsigned UF;
769 
770   /// The builder that we use
771   IRBuilder<> Builder;
772 
773   // --- Vectorization state ---
774 
775   /// The vector-loop preheader.
776   BasicBlock *LoopVectorPreHeader;
777 
778   /// The scalar-loop preheader.
779   BasicBlock *LoopScalarPreHeader;
780 
781   /// Middle Block between the vector and the scalar.
782   BasicBlock *LoopMiddleBlock;
783 
784   /// The unique ExitBlock of the scalar loop if one exists.  Note that
785   /// there can be multiple exiting edges reaching this block.
786   BasicBlock *LoopExitBlock;
787 
788   /// The vector loop body.
789   BasicBlock *LoopVectorBody;
790 
791   /// The scalar loop body.
792   BasicBlock *LoopScalarBody;
793 
794   /// A list of all bypass blocks. The first block is the entry of the loop.
795   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
796 
797   /// The new Induction variable which was added to the new block.
798   PHINode *Induction = nullptr;
799 
800   /// The induction variable of the old basic block.
801   PHINode *OldInduction = nullptr;
802 
803   /// Store instructions that were predicated.
804   SmallVector<Instruction *, 4> PredicatedInstructions;
805 
806   /// Trip count of the original loop.
807   Value *TripCount = nullptr;
808 
809   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
810   Value *VectorTripCount = nullptr;
811 
812   /// The legality analysis.
813   LoopVectorizationLegality *Legal;
814 
815   /// The profitability analysis.
816   LoopVectorizationCostModel *Cost;
817 
818   // Record whether runtime checks are added.
819   bool AddedSafetyChecks = false;
820 
821   // Holds the end values for each induction variable. We save the end values
822   // so we can later fix-up the external users of the induction variables.
823   DenseMap<PHINode *, Value *> IVEndValues;
824 
825   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
826   // fixed up at the end of vector code generation.
827   SmallVector<PHINode *, 8> OrigPHIsToFix;
828 
829   /// BFI and PSI are used to check for profile guided size optimizations.
830   BlockFrequencyInfo *BFI;
831   ProfileSummaryInfo *PSI;
832 
833   // Whether this loop should be optimized for size based on profile-guided
834   // size optimizations.
835   bool OptForSizeBasedOnProfile;
836 
837   /// Structure to hold information about generated runtime checks, responsible
838   /// for cleaning the checks, if vectorization turns out unprofitable.
839   GeneratedRTChecks &RTChecks;
840 };
841 
842 class InnerLoopUnroller : public InnerLoopVectorizer {
843 public:
844   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
845                     LoopInfo *LI, DominatorTree *DT,
846                     const TargetLibraryInfo *TLI,
847                     const TargetTransformInfo *TTI, AssumptionCache *AC,
848                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
849                     LoopVectorizationLegality *LVL,
850                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
851                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
852       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
853                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
854                             BFI, PSI, Check) {}
855 
856 private:
857   Value *getBroadcastInstrs(Value *V) override;
858   Value *getStepVector(
859       Value *Val, Value *StartIdx, Value *Step,
860       Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
861   Value *reverseVector(Value *Vec) override;
862 };
863 
864 /// Encapsulate information regarding vectorization of a loop and its epilogue.
865 /// This information is meant to be updated and used across two stages of
866 /// epilogue vectorization.
867 struct EpilogueLoopVectorizationInfo {
868   ElementCount MainLoopVF = ElementCount::getFixed(0);
869   unsigned MainLoopUF = 0;
870   ElementCount EpilogueVF = ElementCount::getFixed(0);
871   unsigned EpilogueUF = 0;
872   BasicBlock *MainLoopIterationCountCheck = nullptr;
873   BasicBlock *EpilogueIterationCountCheck = nullptr;
874   BasicBlock *SCEVSafetyCheck = nullptr;
875   BasicBlock *MemSafetyCheck = nullptr;
876   Value *TripCount = nullptr;
877   Value *VectorTripCount = nullptr;
878 
879   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
880                                 ElementCount EVF, unsigned EUF)
881       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
882     assert(EUF == 1 &&
883            "A high UF for the epilogue loop is likely not beneficial.");
884   }
885 };
886 
887 /// An extension of the inner loop vectorizer that creates a skeleton for a
888 /// vectorized loop that has its epilogue (residual) also vectorized.
889 /// The idea is to run the vplan on a given loop twice, first to set up the
890 /// skeleton and vectorize the main loop, and second to complete the skeleton
891 /// from the first step and vectorize the epilogue.  This is achieved by
892 /// deriving two concrete strategy classes from this base class and invoking
893 /// them in succession from the loop vectorizer planner.
894 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
895 public:
896   InnerLoopAndEpilogueVectorizer(
897       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
898       DominatorTree *DT, const TargetLibraryInfo *TLI,
899       const TargetTransformInfo *TTI, AssumptionCache *AC,
900       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
901       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
902       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
903       GeneratedRTChecks &Checks)
904       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
905                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
906                             Checks),
907         EPI(EPI) {}
908 
909   // Override this function to handle the more complex control flow around the
910   // three loops.
911   BasicBlock *createVectorizedLoopSkeleton() final override {
912     return createEpilogueVectorizedLoopSkeleton();
913   }
914 
915   /// The interface for creating a vectorized skeleton using one of two
916   /// different strategies, each corresponding to one execution of the vplan
917   /// as described above.
918   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
919 
920   /// Holds and updates state information required to vectorize the main loop
921   /// and its epilogue in two separate passes. This setup helps us avoid
922   /// regenerating and recomputing runtime safety checks. It also helps us to
923   /// shorten the iteration-count-check path length for the cases where the
924   /// iteration count of the loop is so small that the main vector loop is
925   /// completely skipped.
926   EpilogueLoopVectorizationInfo &EPI;
927 };
928 
929 /// A specialized derived class of inner loop vectorizer that performs
930 /// vectorization of *main* loops in the process of vectorizing loops and their
931 /// epilogues.
932 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
933 public:
934   EpilogueVectorizerMainLoop(
935       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
936       DominatorTree *DT, const TargetLibraryInfo *TLI,
937       const TargetTransformInfo *TTI, AssumptionCache *AC,
938       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
939       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
940       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
941       GeneratedRTChecks &Check)
942       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
943                                        EPI, LVL, CM, BFI, PSI, Check) {}
944   /// Implements the interface for creating a vectorized skeleton using the
945   /// *main loop* strategy (i.e. the first pass of vplan execution).
946   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
947 
948 protected:
949   /// Emits an iteration count bypass check once for the main loop (when \p
950   /// ForEpilogue is false) and once for the epilogue loop (when \p
951   /// ForEpilogue is true).
952   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
953                                              bool ForEpilogue);
954   void printDebugTracesAtStart() override;
955   void printDebugTracesAtEnd() override;
956 };
957 
958 // A specialized derived class of inner loop vectorizer that performs
959 // vectorization of *epilogue* loops in the process of vectorizing loops and
960 // their epilogues.
961 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
962 public:
963   EpilogueVectorizerEpilogueLoop(
964       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
965       DominatorTree *DT, const TargetLibraryInfo *TLI,
966       const TargetTransformInfo *TTI, AssumptionCache *AC,
967       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
968       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
969       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
970       GeneratedRTChecks &Checks)
971       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
972                                        EPI, LVL, CM, BFI, PSI, Checks) {}
973   /// Implements the interface for creating a vectorized skeleton using the
974   /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
975   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
976 
977 protected:
978   /// Emits an iteration count bypass check after the main vector loop has
979   /// finished to see if there are any iterations left to execute by either
980   /// the vector epilogue or the scalar epilogue.
981   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
982                                                       BasicBlock *Bypass,
983                                                       BasicBlock *Insert);
984   void printDebugTracesAtStart() override;
985   void printDebugTracesAtEnd() override;
986 };
987 } // end namespace llvm
988 
989 /// Look for a meaningful debug location on the instruction or its
990 /// operands.
991 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
992   if (!I)
993     return I;
994 
995   DebugLoc Empty;
996   if (I->getDebugLoc() != Empty)
997     return I;
998 
999   for (Use &Op : I->operands()) {
1000     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1001       if (OpInst->getDebugLoc() != Empty)
1002         return OpInst;
1003   }
1004 
1005   return I;
1006 }
1007 
1008 void InnerLoopVectorizer::setDebugLocFromInst(
1009     const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
1010   IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
1011   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
1012     const DILocation *DIL = Inst->getDebugLoc();
1013 
1014     // When an FSDiscriminator is enabled, we don't need to add the multiply
1015     // factors to the discriminators.
1016     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1017         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
1018       // FIXME: For scalable vectors, assume vscale=1.
1019       auto NewDIL =
1020           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1021       if (NewDIL)
1022         B->SetCurrentDebugLocation(NewDIL.getValue());
1023       else
1024         LLVM_DEBUG(dbgs()
1025                    << "Failed to create new discriminator: "
1026                    << DIL->getFilename() << " Line: " << DIL->getLine());
1027     } else
1028       B->SetCurrentDebugLocation(DIL);
1029   } else
1030     B->SetCurrentDebugLocation(DebugLoc());
1031 }
1032 
1033 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1034 /// is passed, the message relates to that particular instruction.
1035 #ifndef NDEBUG
1036 static void debugVectorizationMessage(const StringRef Prefix,
1037                                       const StringRef DebugMsg,
1038                                       Instruction *I) {
1039   dbgs() << "LV: " << Prefix << DebugMsg;
1040   if (I != nullptr)
1041     dbgs() << " " << *I;
1042   else
1043     dbgs() << '.';
1044   dbgs() << '\n';
1045 }
1046 #endif
1047 
1048 /// Create an analysis remark that explains why vectorization failed
1049 ///
1050 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1051 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1052 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1053 /// the location of the remark.  \return the remark object that can be
1054 /// streamed to.
1055 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1056     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1057   Value *CodeRegion = TheLoop->getHeader();
1058   DebugLoc DL = TheLoop->getStartLoc();
1059 
1060   if (I) {
1061     CodeRegion = I->getParent();
1062     // If there is no debug location attached to the instruction, fall back to
1063     // using the loop's.
1064     if (I->getDebugLoc())
1065       DL = I->getDebugLoc();
1066   }
1067 
1068   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1069 }
1070 
1071 /// Return a value for Step multiplied by VF.
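/// For example (illustrative): for a fixed VF of 4 and Step = 2 this returns
/// the constant 8; for a scalable VF of <vscale x 4> it returns 8 * vscale,
/// materialized via the llvm.vscale intrinsic.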
1072 static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
1073                               int64_t Step) {
1074   assert(Ty->isIntegerTy() && "Expected an integer step");
1075   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
1076   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1077 }
1078 
1079 namespace llvm {
1080 
1081 /// Return the runtime value for VF.
1082 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1083   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1084   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1085 }
1086 
1087 static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
1088   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1089   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1090   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1091   return B.CreateUIToFP(RuntimeVF, FTy);
1092 }
1093 
1094 void reportVectorizationFailure(const StringRef DebugMsg,
1095                                 const StringRef OREMsg, const StringRef ORETag,
1096                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1097                                 Instruction *I) {
1098   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1099   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1100   ORE->emit(
1101       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1102       << "loop not vectorized: " << OREMsg);
1103 }
1104 
1105 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1106                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1107                              Instruction *I) {
1108   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1109   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1110   ORE->emit(
1111       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1112       << Msg);
1113 }
1114 
1115 } // end namespace llvm
1116 
1117 #ifndef NDEBUG
1118 /// \return string containing a file name and a line # for the given loop.
1119 static std::string getDebugLocString(const Loop *L) {
1120   std::string Result;
1121   if (L) {
1122     raw_string_ostream OS(Result);
1123     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1124       LoopDbgLoc.print(OS);
1125     else
1126       // Just print the module name.
1127       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1128     OS.flush();
1129   }
1130   return Result;
1131 }
1132 #endif
1133 
1134 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1135                                          const Instruction *Orig) {
1136   // If the loop was versioned with memchecks, add the corresponding no-alias
1137   // metadata.
1138   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1139     LVer->annotateInstWithNoAlias(To, Orig);
1140 }
1141 
1142 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1143     VPTransformState &State) {
1144 
1145   // Collect recipes in the backward slice of `Root` that may generate a poison
1146   // value that is used after vectorization.
1147   SmallPtrSet<VPRecipeBase *, 16> Visited;
1148   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1149     SmallVector<VPRecipeBase *, 16> Worklist;
1150     Worklist.push_back(Root);
1151 
1152     // Traverse the backward slice of Root through its use-def chain.
1153     while (!Worklist.empty()) {
1154       VPRecipeBase *CurRec = Worklist.back();
1155       Worklist.pop_back();
1156 
1157       if (!Visited.insert(CurRec).second)
1158         continue;
1159 
1160       // Prune search if we find another recipe generating a widen memory
1161       // instruction. Widen memory instructions involved in address computation
1162       // will lead to gather/scatter instructions, which don't need to be
1163       // handled.
1164       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1165           isa<VPInterleaveRecipe>(CurRec))
1166         continue;
1167 
1168       // This recipe contributes to the address computation of a widen
1169       // load/store. Collect recipe if its underlying instruction has
1170       // poison-generating flags.
1171       Instruction *Instr = CurRec->getUnderlyingInstr();
1172       if (Instr && Instr->hasPoisonGeneratingFlags())
1173         State.MayGeneratePoisonRecipes.insert(CurRec);
1174 
1175       // Add new definitions to the worklist.
1176       for (VPValue *operand : CurRec->operands())
1177         if (VPDef *OpDef = operand->getDef())
1178           Worklist.push_back(cast<VPRecipeBase>(OpDef));
1179     }
1180   });
1181 
1182   // Traverse all the recipes in the VPlan and collect the poison-generating
1183   // recipes in the backward slice starting at the address of a
1184   // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1185   auto Iter = depth_first(
1186       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1187   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1188     for (VPRecipeBase &Recipe : *VPBB) {
1189       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1190         Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
1191         VPDef *AddrDef = WidenRec->getAddr()->getDef();
1192         if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
1193             Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
1194           collectPoisonGeneratingInstrsInBackwardSlice(
1195               cast<VPRecipeBase>(AddrDef));
1196       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1197         VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1198         if (AddrDef) {
1199           // Check if any member of the interleave group needs predication.
1200           const InterleaveGroup<Instruction> *InterGroup =
1201               InterleaveRec->getInterleaveGroup();
1202           bool NeedPredication = false;
1203           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1204                I < NumMembers; ++I) {
1205             Instruction *Member = InterGroup->getMember(I);
1206             if (Member)
1207               NeedPredication |=
1208                   Legal->blockNeedsPredication(Member->getParent());
1209           }
1210 
1211           if (NeedPredication)
1212             collectPoisonGeneratingInstrsInBackwardSlice(
1213                 cast<VPRecipeBase>(AddrDef));
1214         }
1215       }
1216     }
1217   }
1218 }
1219 
1220 void InnerLoopVectorizer::addMetadata(Instruction *To,
1221                                       Instruction *From) {
1222   propagateMetadata(To, From);
1223   addNewMetadata(To, From);
1224 }
1225 
1226 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1227                                       Instruction *From) {
1228   for (Value *V : To) {
1229     if (Instruction *I = dyn_cast<Instruction>(V))
1230       addMetadata(I, From);
1231   }
1232 }
1233 
1234 namespace llvm {
1235 
// Hints for the loop vectorization cost model about how the scalar epilogue
// loop should be lowered.
1238 enum ScalarEpilogueLowering {
1239 
1240   // The default: allowing scalar epilogues.
1241   CM_ScalarEpilogueAllowed,
1242 
1243   // Vectorization with OptForSize: don't allow epilogues.
1244   CM_ScalarEpilogueNotAllowedOptSize,
1245 
  // A special case of vectorization with OptForSize: loops with a very small
1247   // trip count are considered for vectorization under OptForSize, thereby
1248   // making sure the cost of their loop body is dominant, free of runtime
1249   // guards and scalar iteration overheads.
1250   CM_ScalarEpilogueNotAllowedLowTripLoop,
1251 
1252   // Loop hint predicate indicating an epilogue is undesired.
1253   CM_ScalarEpilogueNotNeededUsePredicate,
1254 
1255   // Directive indicating we must either tail fold or not vectorize
1256   CM_ScalarEpilogueNotAllowedUsePredicate
1257 };
1258 
1259 /// ElementCountComparator creates a total ordering for ElementCount
1260 /// for the purposes of using it in a set structure.
1261 struct ElementCountComparator {
1262   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1263     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1264            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1265   }
1266 };
1267 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1268 
1269 /// LoopVectorizationCostModel - estimates the expected speedups due to
1270 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
1273 /// expected speedup/slowdowns due to the supported instruction set. We use the
1274 /// TargetTransformInfo to query the different backends for the cost of
1275 /// different operations.
1276 class LoopVectorizationCostModel {
1277 public:
1278   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1279                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1280                              LoopVectorizationLegality *Legal,
1281                              const TargetTransformInfo &TTI,
1282                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1283                              AssumptionCache *AC,
1284                              OptimizationRemarkEmitter *ORE, const Function *F,
1285                              const LoopVectorizeHints *Hints,
1286                              InterleavedAccessInfo &IAI)
1287       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1288         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1289         Hints(Hints), InterleaveInfo(IAI) {}
1290 
1291   /// \return An upper bound for the vectorization factors (both fixed and
1292   /// scalable). If the factors are 0, vectorization and interleaving should be
1293   /// avoided up front.
1294   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1295 
1296   /// \return True if runtime checks are required for vectorization, and false
1297   /// otherwise.
1298   bool runtimeChecksRequired();
1299 
1300   /// \return The most profitable vectorization factor and the cost of that VF.
1301   /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1302   /// then this vectorization factor will be selected if vectorization is
1303   /// possible.
1304   VectorizationFactor
1305   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1306 
1307   VectorizationFactor
1308   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1309                                     const LoopVectorizationPlanner &LVP);
1310 
1311   /// Setup cost-based decisions for user vectorization factor.
1312   /// \return true if the UserVF is a feasible VF to be chosen.
1313   bool selectUserVectorizationFactor(ElementCount UserVF) {
1314     collectUniformsAndScalars(UserVF);
1315     collectInstsToScalarize(UserVF);
1316     return expectedCost(UserVF).first.isValid();
1317   }
1318 
1319   /// \return The size (in bits) of the smallest and widest types in the code
1320   /// that needs to be vectorized. We ignore values that remain scalar such as
1321   /// 64 bit loop indices.
1322   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1323 
1324   /// \return The desired interleave count.
1325   /// If interleave count has been specified by metadata it will be returned.
1326   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1327   /// are the selected vectorization factor and the cost of the selected VF.
1328   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1329 
  /// A memory access instruction may be vectorized in more than one way; the
  /// form the instruction takes after vectorization depends on its cost.
  /// This function makes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1337   void setCostBasedWideningDecision(ElementCount VF);
1338 
1339   /// A struct that represents some properties of the register usage
1340   /// of a loop.
1341   struct RegisterUsage {
1342     /// Holds the number of loop invariant values that are used in the loop.
1343     /// The key is ClassID of target-provided register class.
1344     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1345     /// Holds the maximum number of concurrent live intervals in the loop.
1346     /// The key is ClassID of target-provided register class.
1347     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1348   };
1349 
1350   /// \return Returns information about the register usages of the loop for the
1351   /// given vectorization factors.
1352   SmallVector<RegisterUsage, 8>
1353   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1354 
1355   /// Collect values we want to ignore in the cost model.
1356   void collectValuesToIgnore();
1357 
1358   /// Collect all element types in the loop for which widening is needed.
1359   void collectElementTypesForWidening();
1360 
  /// Split reductions into those that happen in the loop, and those that
  /// happen outside. In-loop reductions are collected into
  /// InLoopReductionChains.
1363   void collectInLoopReductions();
1364 
1365   /// Returns true if we should use strict in-order reductions for the given
1366   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1367   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1368   /// of FP operations.
1369   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1370     return !Hints->allowReordering() && RdxDesc.isOrdered();
1371   }
1372 
1373   /// \returns The smallest bitwidth each instruction can be represented with.
1374   /// The vector equivalents of these instructions should be truncated to this
1375   /// type.
1376   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1377     return MinBWs;
1378   }
1379 
1380   /// \returns True if it is more profitable to scalarize instruction \p I for
1381   /// vectorization factor \p VF.
1382   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1383     assert(VF.isVector() &&
1384            "Profitable to scalarize relevant only for VF > 1.");
1385 
1386     // Cost model is not run in the VPlan-native path - return conservative
1387     // result until this changes.
1388     if (EnableVPlanNativePath)
1389       return false;
1390 
1391     auto Scalars = InstsToScalarize.find(VF);
1392     assert(Scalars != InstsToScalarize.end() &&
1393            "VF not yet analyzed for scalarization profitability");
1394     return Scalars->second.find(I) != Scalars->second.end();
1395   }
1396 
1397   /// Returns true if \p I is known to be uniform after vectorization.
1398   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1399     if (VF.isScalar())
1400       return true;
1401 
1402     // Cost model is not run in the VPlan-native path - return conservative
1403     // result until this changes.
1404     if (EnableVPlanNativePath)
1405       return false;
1406 
1407     auto UniformsPerVF = Uniforms.find(VF);
1408     assert(UniformsPerVF != Uniforms.end() &&
1409            "VF not yet analyzed for uniformity");
1410     return UniformsPerVF->second.count(I);
1411   }
1412 
1413   /// Returns true if \p I is known to be scalar after vectorization.
1414   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1415     if (VF.isScalar())
1416       return true;
1417 
1418     // Cost model is not run in the VPlan-native path - return conservative
1419     // result until this changes.
1420     if (EnableVPlanNativePath)
1421       return false;
1422 
1423     auto ScalarsPerVF = Scalars.find(VF);
1424     assert(ScalarsPerVF != Scalars.end() &&
1425            "Scalar values are not calculated for VF");
1426     return ScalarsPerVF->second.count(I);
1427   }
1428 
1429   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1430   /// for vectorization factor \p VF.
1431   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1432     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1433            !isProfitableToScalarize(I, VF) &&
1434            !isScalarAfterVectorization(I, VF);
1435   }
1436 
1437   /// Decision that was taken during cost calculation for memory instruction.
1438   enum InstWidening {
1439     CM_Unknown,
1440     CM_Widen,         // For consecutive accesses with stride +1.
1441     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1442     CM_Interleave,
1443     CM_GatherScatter,
1444     CM_Scalarize
1445   };
1446 
1447   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1448   /// instruction \p I and vector width \p VF.
1449   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1450                            InstructionCost Cost) {
1451     assert(VF.isVector() && "Expected VF >=2");
1452     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1453   }
1454 
1455   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1456   /// interleaving group \p Grp and vector width \p VF.
1457   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1458                            ElementCount VF, InstWidening W,
1459                            InstructionCost Cost) {
1460     assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
1462     /// But the cost will be assigned to one instruction only.
1463     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1464       if (auto *I = Grp->getMember(i)) {
1465         if (Grp->getInsertPos() == I)
1466           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1467         else
1468           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1469       }
1470     }
1471   }
1472 
1473   /// Return the cost model decision for the given instruction \p I and vector
1474   /// width \p VF. Return CM_Unknown if this instruction did not pass
1475   /// through the cost modeling.
1476   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1477     assert(VF.isVector() && "Expected VF to be a vector VF");
1478     // Cost model is not run in the VPlan-native path - return conservative
1479     // result until this changes.
1480     if (EnableVPlanNativePath)
1481       return CM_GatherScatter;
1482 
1483     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1484     auto Itr = WideningDecisions.find(InstOnVF);
1485     if (Itr == WideningDecisions.end())
1486       return CM_Unknown;
1487     return Itr->second.first;
1488   }
1489 
1490   /// Return the vectorization cost for the given instruction \p I and vector
1491   /// width \p VF.
1492   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1493     assert(VF.isVector() && "Expected VF >=2");
1494     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1495     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1496            "The cost is not calculated");
1497     return WideningDecisions[InstOnVF].second;
1498   }
1499 
1500   /// Return True if instruction \p I is an optimizable truncate whose operand
1501   /// is an induction variable. Such a truncate will be removed by adding a new
1502   /// induction variable with the destination type.
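  /// For example, a 'trunc i64 %iv to i32' of an induction variable can be
  /// replaced by a new i32 induction variable using the truncated start and
  /// step values.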
1503   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1504     // If the instruction is not a truncate, return false.
1505     auto *Trunc = dyn_cast<TruncInst>(I);
1506     if (!Trunc)
1507       return false;
1508 
1509     // Get the source and destination types of the truncate.
1510     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1511     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1512 
1513     // If the truncate is free for the given types, return false. Replacing a
1514     // free truncate with an induction variable would add an induction variable
1515     // update instruction to each iteration of the loop. We exclude from this
1516     // check the primary induction variable since it will need an update
1517     // instruction regardless.
1518     Value *Op = Trunc->getOperand(0);
1519     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1520       return false;
1521 
1522     // If the truncated value is not an induction variable, return false.
1523     return Legal->isInductionPhi(Op);
1524   }
1525 
1526   /// Collects the instructions to scalarize for each predicated instruction in
1527   /// the loop.
1528   void collectInstsToScalarize(ElementCount VF);
1529 
1530   /// Collect Uniform and Scalar values for the given \p VF.
1531   /// The sets depend on CM decision for Load/Store instructions
1532   /// that may be vectorized as interleave, gather-scatter or scalarized.
1533   void collectUniformsAndScalars(ElementCount VF) {
1534     // Do the analysis once.
1535     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1536       return;
1537     setCostBasedWideningDecision(VF);
1538     collectLoopUniforms(VF);
1539     collectLoopScalars(VF);
1540   }
1541 
1542   /// Returns true if the target machine supports masked store operation
1543   /// for the given \p DataType and kind of access to \p Ptr.
1544   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1545     return Legal->isConsecutivePtr(DataType, Ptr) &&
1546            TTI.isLegalMaskedStore(DataType, Alignment);
1547   }
1548 
1549   /// Returns true if the target machine supports masked load operation
1550   /// for the given \p DataType and kind of access to \p Ptr.
1551   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1552     return Legal->isConsecutivePtr(DataType, Ptr) &&
1553            TTI.isLegalMaskedLoad(DataType, Alignment);
1554   }
1555 
1556   /// Returns true if the target machine can represent \p V as a masked gather
1557   /// or scatter operation.
1558   bool isLegalGatherOrScatter(Value *V) {
1559     bool LI = isa<LoadInst>(V);
1560     bool SI = isa<StoreInst>(V);
1561     if (!LI && !SI)
1562       return false;
1563     auto *Ty = getLoadStoreType(V);
1564     Align Align = getLoadStoreAlignment(V);
1565     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1566            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1567   }
1568 
1569   /// Returns true if the target machine supports all of the reduction
1570   /// variables found for the given VF.
1571   bool canVectorizeReductions(ElementCount VF) const {
1572     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1573       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1574       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1575     }));
1576   }
1577 
1578   /// Returns true if \p I is an instruction that will be scalarized with
1579   /// predication. Such instructions include conditional stores and
1580   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if \p I will be scalarized
  /// with predication for that VF.
1583   bool isScalarWithPredication(Instruction *I) const;
1584 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// Superset of instructions that return true for isScalarWithPredication.
1588   bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
1589     // When we know the load is uniform and the original scalar loop was not
1590     // predicated we don't need to mark it as a predicated instruction. Any
1591     // vectorised blocks created when tail-folding are something artificial we
1592     // have introduced and we know there is always at least one active lane.
1593     // That's why we call Legal->blockNeedsPredication here because it doesn't
1594     // query tail-folding.
1595     if (IsKnownUniform && isa<LoadInst>(I) &&
1596         !Legal->blockNeedsPredication(I->getParent()))
1597       return false;
1598     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1599       return false;
1600     // Loads and stores that need some form of masked operation are predicated
1601     // instructions.
1602     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1603       return Legal->isMaskRequired(I);
1604     return isScalarWithPredication(I);
1605   }
1606 
1607   /// Returns true if \p I is a memory instruction with consecutive memory
1608   /// access that can be widened.
1609   bool
1610   memoryInstructionCanBeWidened(Instruction *I,
1611                                 ElementCount VF = ElementCount::getFixed(1));
1612 
1613   /// Returns true if \p I is a memory instruction in an interleaved-group
1614   /// of memory accesses that can be vectorized with wide vector loads/stores
1615   /// and shuffles.
1616   bool
1617   interleavedAccessCanBeWidened(Instruction *I,
1618                                 ElementCount VF = ElementCount::getFixed(1));
1619 
1620   /// Check if \p Instr belongs to any interleaved access group.
1621   bool isAccessInterleaved(Instruction *Instr) {
1622     return InterleaveInfo.isInterleaved(Instr);
1623   }
1624 
1625   /// Get the interleaved access group that \p Instr belongs to.
1626   const InterleaveGroup<Instruction> *
1627   getInterleavedAccessGroup(Instruction *Instr) {
1628     return InterleaveInfo.getInterleaveGroup(Instr);
1629   }
1630 
1631   /// Returns true if we're required to use a scalar epilogue for at least
1632   /// the final iteration of the original loop.
1633   bool requiresScalarEpilogue(ElementCount VF) const {
1634     if (!isScalarEpilogueAllowed())
1635       return false;
1636     // If we might exit from anywhere but the latch, must run the exiting
1637     // iteration in scalar form.
1638     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1639       return true;
1640     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1641   }
1642 
  /// Returns true if a scalar epilogue is allowed, i.e. it is not disallowed
  /// due to optsize or a loop hint annotation.
1645   bool isScalarEpilogueAllowed() const {
1646     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1647   }
1648 
  /// Returns true if all loop blocks should be masked to fold the tail loop.
1650   bool foldTailByMasking() const { return FoldTailByMasking; }
1651 
  /// Returns true if the instructions in this block require predication
1653   /// for any reason, e.g. because tail folding now requires a predicate
1654   /// or because the block in the original loop was predicated.
1655   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1656     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1657   }
1658 
1659   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1660   /// nodes to the chain of instructions representing the reductions. Uses a
1661   /// MapVector to ensure deterministic iteration order.
1662   using ReductionChainMap =
1663       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1664 
1665   /// Return the chain of instructions representing an inloop reduction.
1666   const ReductionChainMap &getInLoopReductionChains() const {
1667     return InLoopReductionChains;
1668   }
1669 
1670   /// Returns true if the Phi is part of an inloop reduction.
1671   bool isInLoopReduction(PHINode *Phi) const {
1672     return InLoopReductionChains.count(Phi);
1673   }
1674 
1675   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1676   /// with factor VF.  Return the cost of the instruction, including
1677   /// scalarization overhead if it's needed.
1678   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1679 
1680   /// Estimate cost of a call instruction CI if it were vectorized with factor
1681   /// VF. Return the cost of the instruction, including scalarization overhead
1682   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e., either a vector version isn't available, or it is too
  /// expensive.
1685   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1686                                     bool &NeedToScalarize) const;
1687 
1688   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1689   /// that of B.
1690   bool isMoreProfitable(const VectorizationFactor &A,
1691                         const VectorizationFactor &B) const;
1692 
1693   /// Invalidates decisions already taken by the cost model.
1694   void invalidateCostModelingDecisions() {
1695     WideningDecisions.clear();
1696     Uniforms.clear();
1697     Scalars.clear();
1698   }
1699 
1700 private:
1701   unsigned NumPredStores = 0;
1702 
1703   /// \return An upper bound for the vectorization factors for both
1704   /// fixed and scalable vectorization, where the minimum-known number of
1705   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1706   /// disabled or unsupported, then the scalable part will be equal to
1707   /// ElementCount::getScalable(0).
1708   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1709                                            ElementCount UserVF);
1710 
  /// \return the maximized element count based on the target's vector
1712   /// registers and the loop trip-count, but limited to a maximum safe VF.
1713   /// This is a helper function of computeFeasibleMaxVF.
1714   /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1715   /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
1717   /// D98509). The issue is currently under investigation and this workaround
1718   /// will be removed as soon as possible.
1719   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1720                                        unsigned SmallestType,
1721                                        unsigned WidestType,
1722                                        const ElementCount &MaxSafeVF);
1723 
1724   /// \return the maximum legal scalable VF, based on the safe max number
1725   /// of elements.
1726   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1727 
1728   /// The vectorization cost is a combination of the cost itself and a boolean
1729   /// indicating whether any of the contributing operations will actually
1730   /// operate on vector values after type legalization in the backend. If this
1731   /// latter value is false, then all operations will be scalarized (i.e. no
1732   /// vectorization has actually taken place).
1733   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1734 
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1745 
1746   /// Returns the execution time cost of an instruction for a given vector
1747   /// width. Vector width of one means scalar.
1748   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1749 
1750   /// The cost-computation logic from getInstructionCost which provides
1751   /// the vector type as an output parameter.
1752   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1753                                      Type *&VectorTy);
1754 
1755   /// Return the cost of instructions in an inloop reduction pattern, if I is
1756   /// part of that pattern.
1757   Optional<InstructionCost>
1758   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1759                           TTI::TargetCostKind CostKind);
1760 
1761   /// Calculate vectorization cost of memory instruction \p I.
1762   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1763 
1764   /// The cost computation for scalarized memory instruction.
1765   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1766 
  /// The cost computation for an interleaving group of memory instructions.
1768   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1769 
1770   /// The cost computation for Gather/Scatter instruction.
1771   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1772 
1773   /// The cost computation for widening instruction \p I with consecutive
1774   /// memory access.
1775   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1776 
1777   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1778   /// Load: scalar load + broadcast.
1779   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1780   /// element)
1781   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1782 
1783   /// Estimate the overhead of scalarizing an instruction. This is a
1784   /// convenience wrapper for the type-based getScalarizationOverhead API.
1785   InstructionCost getScalarizationOverhead(Instruction *I,
1786                                            ElementCount VF) const;
1787 
  /// Returns whether the instruction is a load or store and will be emitted
1789   /// as a vector operation.
1790   bool isConsecutiveLoadOrStore(Instruction *I);
1791 
1792   /// Returns true if an artificially high cost for emulated masked memrefs
1793   /// should be used.
1794   bool useEmulatedMaskMemRefHack(Instruction *I);
1795 
1796   /// Map of scalar integer values to the smallest bitwidth they can be legally
1797   /// represented as. The vector equivalents of these values should be truncated
1798   /// to this type.
1799   MapVector<Instruction *, uint64_t> MinBWs;
1800 
1801   /// A type representing the costs for instructions if they were to be
1802   /// scalarized rather than vectorized. The entries are Instruction-Cost
1803   /// pairs.
1804   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1805 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1808   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1809 
1810   /// Records whether it is allowed to have the original scalar loop execute at
1811   /// least once. This may be needed as a fallback loop in case runtime
1812   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
1814   /// or as a peel-loop to handle gaps in interleave-groups.
1815   /// Under optsize and when the trip count is very small we don't allow any
1816   /// iterations to execute in the scalar loop.
1817   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1818 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1820   bool FoldTailByMasking = false;
1821 
1822   /// A map holding scalar costs for different vectorization factors. The
1823   /// presence of a cost for an instruction in the mapping indicates that the
1824   /// instruction will be scalarized when vectorizing with the associated
1825   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1826   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1827 
1828   /// Holds the instructions known to be uniform after vectorization.
1829   /// The data is collected per VF.
1830   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1831 
1832   /// Holds the instructions known to be scalar after vectorization.
1833   /// The data is collected per VF.
1834   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1835 
1836   /// Holds the instructions (address computations) that are forced to be
1837   /// scalarized.
1838   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1839 
1840   /// PHINodes of the reductions that should be expanded in-loop along with
1841   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1843   ReductionChainMap InLoopReductionChains;
1844 
1845   /// A Map of inloop reduction operations and their immediate chain operand.
1846   /// FIXME: This can be removed once reductions can be costed correctly in
1847   /// vplan. This was added to allow quick lookup to the inloop operations,
1848   /// without having to loop through InLoopReductionChains.
1849   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1850 
1851   /// Returns the expected difference in cost from scalarizing the expression
1852   /// feeding a predicated instruction \p PredInst. The instructions to
1853   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1854   /// non-negative return value implies the expression will be scalarized.
1855   /// Currently, only single-use chains are considered for scalarization.
1856   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1857                               ElementCount VF);
1858 
1859   /// Collect the instructions that are uniform after vectorization. An
1860   /// instruction is uniform if we represent it with a single scalar value in
1861   /// the vectorized loop corresponding to each vector iteration. Examples of
1862   /// uniform instructions include pointer operands of consecutive or
1863   /// interleaved memory accesses. Note that although uniformity implies an
1864   /// instruction will be scalar, the reverse is not true. In general, a
1865   /// scalarized instruction will be represented by VF scalar values in the
1866   /// vectorized loop, each corresponding to an iteration of the original
1867   /// scalar loop.
1868   void collectLoopUniforms(ElementCount VF);
1869 
1870   /// Collect the instructions that are scalar after vectorization. An
1871   /// instruction is scalar if it is known to be uniform or will be scalarized
1872   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1873   /// to the list if they are used by a load/store instruction that is marked as
1874   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1875   /// VF values in the vectorized loop, each corresponding to an iteration of
1876   /// the original scalar loop.
1877   void collectLoopScalars(ElementCount VF);
1878 
1879   /// Keeps cost model vectorization decision and cost for instructions.
1880   /// Right now it is used for memory instructions only.
1881   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1882                                 std::pair<InstWidening, InstructionCost>>;
1883 
1884   DecisionList WideningDecisions;
1885 
1886   /// Returns true if \p V is expected to be vectorized and it needs to be
1887   /// extracted.
1888   bool needsExtract(Value *V, ElementCount VF) const {
1889     Instruction *I = dyn_cast<Instruction>(V);
1890     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1891         TheLoop->isLoopInvariant(I))
1892       return false;
1893 
1894     // Assume we can vectorize V (and hence we need extraction) if the
1895     // scalars are not computed yet. This can happen, because it is called
1896     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1897     // the scalars are collected. That should be a safe assumption in most
1898     // cases, because we check if the operands have vectorizable types
1899     // beforehand in LoopVectorizationLegality.
1900     return Scalars.find(VF) == Scalars.end() ||
1901            !isScalarAfterVectorization(I, VF);
1902   };
1903 
1904   /// Returns a range containing only operands needing to be extracted.
1905   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1906                                                    ElementCount VF) const {
1907     return SmallVector<Value *, 4>(make_filter_range(
1908         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1909   }
1910 
1911   /// Determines if we have the infrastructure to vectorize loop \p L and its
1912   /// epilogue, assuming the main loop is vectorized by \p VF.
1913   bool isCandidateForEpilogueVectorization(const Loop &L,
1914                                            const ElementCount VF) const;
1915 
1916   /// Returns true if epilogue vectorization is considered profitable, and
1917   /// false otherwise.
1918   /// \p VF is the vectorization factor chosen for the original loop.
1919   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1920 
1921 public:
1922   /// The loop that we evaluate.
1923   Loop *TheLoop;
1924 
1925   /// Predicated scalar evolution analysis.
1926   PredicatedScalarEvolution &PSE;
1927 
1928   /// Loop Info analysis.
1929   LoopInfo *LI;
1930 
1931   /// Vectorization legality.
1932   LoopVectorizationLegality *Legal;
1933 
1934   /// Vector target information.
1935   const TargetTransformInfo &TTI;
1936 
1937   /// Target Library Info.
1938   const TargetLibraryInfo *TLI;
1939 
1940   /// Demanded bits analysis.
1941   DemandedBits *DB;
1942 
1943   /// Assumption cache.
1944   AssumptionCache *AC;
1945 
1946   /// Interface to emit optimization remarks.
1947   OptimizationRemarkEmitter *ORE;
1948 
1949   const Function *TheFunction;
1950 
1951   /// Loop Vectorize Hint.
1952   const LoopVectorizeHints *Hints;
1953 
  /// The interleaved access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1956   InterleavedAccessInfo &InterleaveInfo;
1957 
1958   /// Values to ignore in the cost model.
1959   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1960 
1961   /// Values to ignore in the cost model when VF > 1.
1962   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1963 
1964   /// All element types found in the loop.
1965   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1966 
1967   /// Profitable vector factors.
1968   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1969 };
1970 } // end namespace llvm
1971 
1972 /// Helper struct to manage generating runtime checks for vectorization.
1973 ///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimation of their cost, and are un-linked from the existing IR. After
/// deciding to vectorize, the checks are moved back. If deciding not to
/// vectorize, the temporary blocks are completely removed.
1978 class GeneratedRTChecks {
1979   /// Basic block which contains the generated SCEV checks, if any.
1980   BasicBlock *SCEVCheckBlock = nullptr;
1981 
1982   /// The value representing the result of the generated SCEV checks. If it is
1983   /// nullptr, either no SCEV checks have been generated or they have been used.
1984   Value *SCEVCheckCond = nullptr;
1985 
1986   /// Basic block which contains the generated memory runtime checks, if any.
1987   BasicBlock *MemCheckBlock = nullptr;
1988 
1989   /// The value representing the result of the generated memory runtime checks.
1990   /// If it is nullptr, either no memory runtime checks have been generated or
1991   /// they have been used.
1992   Value *MemRuntimeCheckCond = nullptr;
1993 
1994   DominatorTree *DT;
1995   LoopInfo *LI;
1996 
1997   SCEVExpander SCEVExp;
1998   SCEVExpander MemCheckExp;
1999 
2000 public:
2001   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
2002                     const DataLayout &DL)
2003       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
2004         MemCheckExp(SE, DL, "scev.check") {}
2005 
2006   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
2007   /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
2009   /// there is no vector code generation, the check blocks are removed
2010   /// completely.
2011   void Create(Loop *L, const LoopAccessInfo &LAI,
2012               const SCEVUnionPredicate &UnionPred) {
2013 
2014     BasicBlock *LoopHeader = L->getHeader();
2015     BasicBlock *Preheader = L->getLoopPreheader();
2016 
2017     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
2018     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
2019     // may be used by SCEVExpander. The blocks will be un-linked from their
2020     // predecessors and removed from LI & DT at the end of the function.
2021     if (!UnionPred.isAlwaysTrue()) {
2022       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
2023                                   nullptr, "vector.scevcheck");
2024 
2025       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
2026           &UnionPred, SCEVCheckBlock->getTerminator());
2027     }
2028 
2029     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2030     if (RtPtrChecking.Need) {
2031       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2032       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2033                                  "vector.memcheck");
2034 
2035       MemRuntimeCheckCond =
2036           addRuntimeChecks(MemCheckBlock->getTerminator(), L,
2037                            RtPtrChecking.getChecks(), MemCheckExp);
2038       assert(MemRuntimeCheckCond &&
2039              "no RT checks generated although RtPtrChecking "
2040              "claimed checks are required");
2041     }
2042 
2043     if (!MemCheckBlock && !SCEVCheckBlock)
2044       return;
2045 
    // Unhook the temporary blocks containing the checks and update the various
    // places accordingly.
2048     if (SCEVCheckBlock)
2049       SCEVCheckBlock->replaceAllUsesWith(Preheader);
2050     if (MemCheckBlock)
2051       MemCheckBlock->replaceAllUsesWith(Preheader);
2052 
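    // Hand each check block's terminator over to the preheader (replacing the
    // preheader's current terminator) and terminate the check blocks with
    // 'unreachable', so they stay detached from the CFG until they are re-added
    // during vector code generation.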
2053     if (SCEVCheckBlock) {
2054       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2055       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2056       Preheader->getTerminator()->eraseFromParent();
2057     }
2058     if (MemCheckBlock) {
2059       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2060       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2061       Preheader->getTerminator()->eraseFromParent();
2062     }
2063 
2064     DT->changeImmediateDominator(LoopHeader, Preheader);
2065     if (MemCheckBlock) {
2066       DT->eraseNode(MemCheckBlock);
2067       LI->removeBlock(MemCheckBlock);
2068     }
2069     if (SCEVCheckBlock) {
2070       DT->eraseNode(SCEVCheckBlock);
2071       LI->removeBlock(SCEVCheckBlock);
2072     }
2073   }
2074 
2075   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2076   /// unused.
2077   ~GeneratedRTChecks() {
2078     SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
2079     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
2080     if (!SCEVCheckCond)
2081       SCEVCleaner.markResultUsed();
2082 
2083     if (!MemRuntimeCheckCond)
2084       MemCheckCleaner.markResultUsed();
2085 
2086     if (MemRuntimeCheckCond) {
2087       auto &SE = *MemCheckExp.getSE();
2088       // Memory runtime check generation creates compares that use expanded
2089       // values. Remove them before running the SCEVExpanderCleaners.
2090       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2091         if (MemCheckExp.isInsertedInstruction(&I))
2092           continue;
2093         SE.forgetValue(&I);
2094         I.eraseFromParent();
2095       }
2096     }
2097     MemCheckCleaner.cleanup();
2098     SCEVCleaner.cleanup();
2099 
2100     if (SCEVCheckCond)
2101       SCEVCheckBlock->eraseFromParent();
2102     if (MemRuntimeCheckCond)
2103       MemCheckBlock->eraseFromParent();
2104   }
2105 
2106   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2107   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2108   /// depending on the generated condition.
2109   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2110                              BasicBlock *LoopVectorPreHeader,
2111                              BasicBlock *LoopExitBlock) {
2112     if (!SCEVCheckCond)
2113       return nullptr;
2114     if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2115       if (C->isZero())
2116         return nullptr;
2117 
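    // Hook SCEVCheckBlock back into the CFG between the vector preheader's
    // single predecessor and the vector preheader, keeping LoopInfo and the
    // dominator tree up to date.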
2118     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2119 
2120     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // If the vector preheader is nested inside a loop, the SCEV check block
    // must become part of that loop as well.
2122     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2123       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2124 
2125     SCEVCheckBlock->getTerminator()->eraseFromParent();
2126     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2127     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2128                                                 SCEVCheckBlock);
2129 
2130     DT->addNewBlock(SCEVCheckBlock, Pred);
2131     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2132 
2133     ReplaceInstWithInst(
2134         SCEVCheckBlock->getTerminator(),
2135         BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2136     // Mark the check as used, to prevent it from being removed during cleanup.
2137     SCEVCheckCond = nullptr;
2138     return SCEVCheckBlock;
2139   }
2140 
2141   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2142   /// the branches to branch to the vector preheader or \p Bypass, depending on
2143   /// the generated condition.
2144   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2145                                    BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks at runtime whether arrays overlap.
2147     if (!MemRuntimeCheckCond)
2148       return nullptr;
2149 
2150     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2151     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2152                                                 MemCheckBlock);
2153 
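    // Register the new block in the dominator tree and make it the immediate
    // dominator of the vector preheader.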
2154     DT->addNewBlock(MemCheckBlock, Pred);
2155     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2156     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2157 
2158     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2159       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2160 
2161     ReplaceInstWithInst(
2162         MemCheckBlock->getTerminator(),
2163         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2164     MemCheckBlock->getTerminator()->setDebugLoc(
2165         Pred->getTerminator()->getDebugLoc());
2166 
2167     // Mark the check as used, to prevent it from being removed during cleanup.
2168     MemRuntimeCheckCond = nullptr;
2169     return MemCheckBlock;
2170   }
2171 };
2172 
2173 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2174 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2176 // vector length information is not provided, vectorization is not considered
2177 // explicit. Interleave hints are not allowed either. These limitations will be
2178 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
2180 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2181 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2182 // provides *explicit vectorization hints* (LV can bypass legal checks and
2183 // assume that vectorization is legal). However, both hints are implemented
2184 // using the same metadata (llvm.loop.vectorize, processed by
2185 // LoopVectorizeHints). This will be fixed in the future when the native IR
2186 // representation for pragma 'omp simd' is introduced.
2187 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2188                                    OptimizationRemarkEmitter *ORE) {
2189   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2190   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2191 
2192   // Only outer loops with an explicit vectorization hint are supported.
2193   // Unannotated outer loops are ignored.
2194   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2195     return false;
2196 
2197   Function *Fn = OuterLp->getHeader()->getParent();
2198   if (!Hints.allowVectorization(Fn, OuterLp,
2199                                 true /*VectorizeOnlyWhenForced*/)) {
2200     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2201     return false;
2202   }
2203 
2204   if (Hints.getInterleave() > 1) {
2205     // TODO: Interleave support is future work.
2206     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2207                          "outer loops.\n");
2208     Hints.emitRemarkWithHints();
2209     return false;
2210   }
2211 
2212   return true;
2213 }
2214 
2215 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2216                                   OptimizationRemarkEmitter *ORE,
2217                                   SmallVectorImpl<Loop *> &V) {
2218   // Collect inner loops and outer loops without irreducible control flow. For
2219   // now, only collect outer loops that have explicit vectorization hints. If we
2220   // are stress testing the VPlan H-CFG construction, we collect the outermost
2221   // loop of every loop nest.
2222   if (L.isInnermost() || VPlanBuildStressTest ||
2223       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2224     LoopBlocksRPO RPOT(&L);
2225     RPOT.perform(LI);
2226     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2227       V.push_back(&L);
2228       // TODO: Collect inner loops inside marked outer loops in case
2229       // vectorization fails for the outer loop. Do not invoke
2230       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2231       // already known to be reducible. We can use an inherited attribute for
2232       // that.
2233       return;
2234     }
2235   }
2236   for (Loop *InnerL : L)
2237     collectSupportedLoops(*InnerL, LI, ORE, V);
2238 }
2239 
2240 namespace {
2241 
2242 /// The LoopVectorize Pass.
2243 struct LoopVectorize : public FunctionPass {
2244   /// Pass identification, replacement for typeid
2245   static char ID;
2246 
2247   LoopVectorizePass Impl;
2248 
2249   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2250                          bool VectorizeOnlyWhenForced = false)
2251       : FunctionPass(ID),
2252         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2253     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2254   }
2255 
2256   bool runOnFunction(Function &F) override {
2257     if (skipFunction(F))
2258       return false;
2259 
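    // Gather the analyses required by the vectorizer from the legacy pass
    // manager.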
2260     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2261     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2262     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2263     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2264     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2265     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2266     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2267     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2268     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2269     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2270     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2271     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2272     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2273 
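    // Compute LoopAccessInfo lazily, per loop, through the legacy analysis
    // wrapper.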
2274     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2275         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2276 
2277     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2278                         GetLAA, *ORE, PSI).MadeAnyChange;
2279   }
2280 
2281   void getAnalysisUsage(AnalysisUsage &AU) const override {
2282     AU.addRequired<AssumptionCacheTracker>();
2283     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2284     AU.addRequired<DominatorTreeWrapperPass>();
2285     AU.addRequired<LoopInfoWrapperPass>();
2286     AU.addRequired<ScalarEvolutionWrapperPass>();
2287     AU.addRequired<TargetTransformInfoWrapperPass>();
2288     AU.addRequired<AAResultsWrapperPass>();
2289     AU.addRequired<LoopAccessLegacyAnalysis>();
2290     AU.addRequired<DemandedBitsWrapperPass>();
2291     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2292     AU.addRequired<InjectTLIMappingsLegacy>();
2293 
2294     // We currently do not preserve loopinfo/dominator analyses with outer loop
2295     // vectorization. Until this is addressed, mark these analyses as preserved
    // only for the non-VPlan-native path.
2297     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2298     if (!EnableVPlanNativePath) {
2299       AU.addPreserved<LoopInfoWrapperPass>();
2300       AU.addPreserved<DominatorTreeWrapperPass>();
2301     }
2302 
2303     AU.addPreserved<BasicAAWrapperPass>();
2304     AU.addPreserved<GlobalsAAWrapperPass>();
2305     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2306   }
2307 };
2308 
2309 } // end anonymous namespace
2310 
2311 //===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer,
// LoopVectorizationCostModel and LoopVectorizationPlanner.
2314 //===----------------------------------------------------------------------===//
2315 
2316 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2317   // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
2320   Instruction *Instr = dyn_cast<Instruction>(V);
2321   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2322                      (!Instr ||
2323                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2324   // Place the code for broadcasting invariant variables in the new preheader.
2325   IRBuilder<>::InsertPointGuard Guard(Builder);
2326   if (SafeToHoist)
2327     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2328 
2329   // Broadcast the scalar into all locations in the vector.
2330   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2331 
2332   return Shuf;
2333 }
2334 
2335 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2336     const InductionDescriptor &II, Value *Step, Value *Start,
2337     Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
2338   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2339          "Expected either an induction phi-node or a truncate of it!");
2340 
2341   // Construct the initial value of the vector IV in the vector loop preheader
2342   auto CurrIP = Builder.saveIP();
2343   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2344   if (isa<TruncInst>(EntryVal)) {
2345     assert(Start->getType()->isIntegerTy() &&
2346            "Truncation requires an integer type");
2347     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2348     Step = Builder.CreateTrunc(Step, TruncType);
2349     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2350   }
2351 
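  // Form the initial value of the vector induction variable by splatting the
  // start value and adding a per-lane multiple of the step, so lane i holds
  // Start + i * Step.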
2352   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
2353   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2354   Value *SteppedStart =
2355       getStepVector(SplatStart, Zero, Step, II.getInductionOpcode());
2356 
2357   // We create vector phi nodes for both integer and floating-point induction
2358   // variables. Here, we determine the kind of arithmetic we will perform.
2359   Instruction::BinaryOps AddOp;
2360   Instruction::BinaryOps MulOp;
2361   if (Step->getType()->isIntegerTy()) {
2362     AddOp = Instruction::Add;
2363     MulOp = Instruction::Mul;
2364   } else {
2365     AddOp = II.getInductionOpcode();
2366     MulOp = Instruction::FMul;
2367   }
2368 
2369   // Multiply the vectorization factor by the step using integer or
2370   // floating-point arithmetic as appropriate.
2371   Type *StepType = Step->getType();
2372   Value *RuntimeVF;
2373   if (Step->getType()->isFloatingPointTy())
2374     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF);
2375   else
2376     RuntimeVF = getRuntimeVF(Builder, StepType, VF);
2377   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2378 
2379   // Create a vector splat to use in the induction update.
2380   //
2381   // FIXME: If the step is non-constant, we create the vector splat with
2382   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2383   //        handle a constant vector splat.
2384   Value *SplatVF = isa<Constant>(Mul)
2385                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2386                        : Builder.CreateVectorSplat(VF, Mul);
2387   Builder.restoreIP(CurrIP);
2388 
2389   // We may need to add the step a number of times, depending on the unroll
2390   // factor. The last of those goes into the PHI.
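  // For example, with VF=4, UF=2 and an integer IV starting at 0 with step 1,
  // the emitted IR is roughly (names are illustrative):
  //   %vec.ind = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ],
  //                            [ %vec.ind.next, %vector.body ]
  //   %step.add = add <4 x i64> %vec.ind, <i64 4, i64 4, i64 4, i64 4>
  //   %vec.ind.next = add <4 x i64> %step.add, <i64 4, i64 4, i64 4, i64 4>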
2391   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2392                                     &*LoopVectorBody->getFirstInsertionPt());
2393   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2394   Instruction *LastInduction = VecInd;
2395   for (unsigned Part = 0; Part < UF; ++Part) {
2396     State.set(Def, LastInduction, Part);
2397 
2398     if (isa<TruncInst>(EntryVal))
2399       addMetadata(LastInduction, EntryVal);
2400 
2401     LastInduction = cast<Instruction>(
2402         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2403     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2404   }
2405 
2406   // Move the last step to the end of the latch block. This ensures consistent
2407   // placement of all induction updates.
2408   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2409   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2410   auto *ICmp = cast<Instruction>(Br->getCondition());
2411   LastInduction->moveBefore(ICmp);
2412   LastInduction->setName("vec.ind.next");
2413 
2414   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2415   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2416 }
2417 
2418 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2419   return Cost->isScalarAfterVectorization(I, VF) ||
2420          Cost->isProfitableToScalarize(I, VF);
2421 }
2422 
2423 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2424   if (shouldScalarizeInstruction(IV))
2425     return true;
2426   auto isScalarInst = [&](User *U) -> bool {
2427     auto *I = cast<Instruction>(U);
2428     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2429   };
2430   return llvm::any_of(IV->users(), isScalarInst);
2431 }
2432 
2433 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV,
2434                                                 const InductionDescriptor &ID,
2435                                                 Value *Start, TruncInst *Trunc,
2436                                                 VPValue *Def,
2437                                                 VPTransformState &State) {
2438   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2439          "Primary induction variable must have an integer type");
2440   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2441 
2442   // The value from the original loop to which we are mapping the new induction
2443   // variable.
2444   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2445 
2446   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2447 
2448   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2450   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2451     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2452            "Induction step should be loop invariant");
2453     if (PSE.getSE()->isSCEVable(IV->getType())) {
2454       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2455       return Exp.expandCodeFor(Step, Step->getType(),
2456                                LoopVectorPreHeader->getTerminator());
2457     }
2458     return cast<SCEVUnknown>(Step)->getValue();
2459   };
2460 
2461   // The scalar value to broadcast. This is derived from the canonical
2462   // induction variable. If a truncation type is given, truncate the canonical
2463   // induction variable and step. Otherwise, derive these values from the
2464   // induction descriptor.
2465   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2466     Value *ScalarIV = Induction;
2467     if (IV != OldInduction) {
2468       ScalarIV = IV->getType()->isIntegerTy()
2469                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2470                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2471                                           IV->getType());
2472       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2473       ScalarIV->setName("offset.idx");
2474     }
2475     if (Trunc) {
2476       auto *TruncType = cast<IntegerType>(Trunc->getType());
2477       assert(Step->getType()->isIntegerTy() &&
2478              "Truncation requires an integer step");
2479       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2480       Step = Builder.CreateTrunc(Step, TruncType);
2481     }
2482     return ScalarIV;
2483   };
2484 
2485   // Create the vector values from the scalar IV, in the absence of creating a
2486   // vector IV.
2487   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2488     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2489     for (unsigned Part = 0; Part < UF; ++Part) {
2490       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2491       Value *StartIdx;
2492       if (Step->getType()->isFloatingPointTy())
2493         StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part);
2494       else
2495         StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part);
2496 
2497       Value *EntryPart =
2498           getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode());
2499       State.set(Def, EntryPart, Part);
2500       if (Trunc)
2501         addMetadata(EntryPart, Trunc);
2502     }
2503   };
2504 
2505   // Fast-math-flags propagate from the original induction instruction.
2506   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2507   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2508     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2509 
2510   // Now do the actual transformations, and start with creating the step value.
2511   Value *Step = CreateStepValue(ID.getStep());
2512   if (VF.isZero() || VF.isScalar()) {
2513     Value *ScalarIV = CreateScalarIV(Step);
2514     CreateSplatIV(ScalarIV, Step);
2515     return;
2516   }
2517 
2518   // Determine if we want a scalar version of the induction variable. This is
2519   // true if the induction variable itself is not widened, or if it has at
2520   // least one user in the loop that is not widened.
2521   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2522   if (!NeedsScalarIV) {
2523     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
2524     return;
2525   }
2526 
2527   // Try to create a new independent vector induction variable. If we can't
2528   // create the phi node, we will splat the scalar induction variable in each
2529   // loop iteration.
2530   if (!shouldScalarizeInstruction(EntryVal)) {
2531     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
2532     Value *ScalarIV = CreateScalarIV(Step);
2533     // Create scalar steps that can be used by instructions we will later
2534     // scalarize. Note that the addition of the scalar steps will not increase
2535     // the number of instructions in the loop in the common case prior to
2536     // InstCombine. We will be trading one vector extract for each scalar step.
2537     buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
2538     return;
2539   }
2540 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV, except when we fold the tail, in which case the splat IV
  // feeds the predicate used by the masked loads/stores.
2544   Value *ScalarIV = CreateScalarIV(Step);
2545   if (!Cost->isScalarEpilogueAllowed())
2546     CreateSplatIV(ScalarIV, Step);
2547   buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
2548 }
2549 
2550 Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
2551                                           Value *Step,
2552                                           Instruction::BinaryOps BinOp) {
2553   // Create and check the types.
2554   auto *ValVTy = cast<VectorType>(Val->getType());
2555   ElementCount VLen = ValVTy->getElementCount();
2556 
2557   Type *STy = Val->getType()->getScalarType();
2558   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2559          "Induction Step must be an integer or FP");
2560   assert(Step->getType() == STy && "Step has wrong type");
2561 
2562   SmallVector<Constant *, 8> Indices;
2563 
2564   // Create a vector of consecutive numbers from zero to VF.
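  // For example, with VF=4 the step vector is <0, 1, 2, 3>; with StartIdx=4
  // and Step=2 the integer path below then produces Val + <8, 10, 12, 14>.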
2565   VectorType *InitVecValVTy = ValVTy;
2566   Type *InitVecValSTy = STy;
2567   if (STy->isFloatingPointTy()) {
2568     InitVecValSTy =
2569         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2570     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2571   }
2572   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2573 
  // Splat the StartIdx.
2575   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2576 
2577   if (STy->isIntegerTy()) {
2578     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2579     Step = Builder.CreateVectorSplat(VLen, Step);
2580     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    //        flags, which can be derived from the original scalar operations.
2583     Step = Builder.CreateMul(InitVec, Step);
2584     return Builder.CreateAdd(Val, Step, "induction");
2585   }
2586 
2587   // Floating point induction.
2588   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2589          "Binary Opcode should be specified for FP induction");
2590   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2591   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2592 
2593   Step = Builder.CreateVectorSplat(VLen, Step);
2594   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2595   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2596 }
2597 
2598 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2599                                            Instruction *EntryVal,
2600                                            const InductionDescriptor &ID,
2601                                            VPValue *Def,
2602                                            VPTransformState &State) {
2603   // We shouldn't have to build scalar steps if we aren't vectorizing.
2604   assert(VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2606   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2607   assert(ScalarIVTy == Step->getType() &&
2608          "Val and Step should have the same type");
2609 
2610   // We build scalar steps for both integer and floating-point induction
2611   // variables. Here, we determine the kind of arithmetic we will perform.
2612   Instruction::BinaryOps AddOp;
2613   Instruction::BinaryOps MulOp;
2614   if (ScalarIVTy->isIntegerTy()) {
2615     AddOp = Instruction::Add;
2616     MulOp = Instruction::Mul;
2617   } else {
2618     AddOp = ID.getInductionOpcode();
2619     MulOp = Instruction::FMul;
2620   }
2621 
2622   // Determine the number of scalars we need to generate for each unroll
2623   // iteration. If EntryVal is uniform, we only need to generate the first
2624   // lane. Otherwise, we generate all VF values.
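  // For example, with VF=4, UF=2, a scalar IV %iv and Step=1, the non-uniform
  // case materializes the scalars %iv+0 .. %iv+3 for part 0 and %iv+4 .. %iv+7
  // for part 1; the uniform case only materializes %iv+0 and %iv+4.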
2625   bool IsUniform =
2626       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
2627   unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
2628   // Compute the scalar steps and save the results in State.
2629   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2630                                      ScalarIVTy->getScalarSizeInBits());
2631   Type *VecIVTy = nullptr;
2632   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2633   if (!IsUniform && VF.isScalable()) {
2634     VecIVTy = VectorType::get(ScalarIVTy, VF);
2635     UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
2636     SplatStep = Builder.CreateVectorSplat(VF, Step);
2637     SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
2638   }
2639 
2640   for (unsigned Part = 0; Part < UF; ++Part) {
2641     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part);
2642 
2643     if (!IsUniform && VF.isScalable()) {
2644       auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
2645       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2646       if (ScalarIVTy->isFloatingPointTy())
2647         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2648       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2649       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2650       State.set(Def, Add, Part);
      // It's also useful to record the lane values for the known minimum
      // number of elements, so we do that below. This improves the code
      // quality when trying to extract the first element, for example.
2654     }
2655 
2656     if (ScalarIVTy->isFloatingPointTy())
2657       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2658 
2659     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2660       Value *StartIdx = Builder.CreateBinOp(
2661           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2662       // The step returned by `createStepForVF` is a runtime-evaluated value
2663       // when VF is scalable. Otherwise, it should be folded into a Constant.
2664       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2665              "Expected StartIdx to be folded to a constant when VF is not "
2666              "scalable");
2667       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2668       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2669       State.set(Def, Add, VPIteration(Part, Lane));
2670     }
2671   }
2672 }
2673 
2674 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2675                                                     const VPIteration &Instance,
2676                                                     VPTransformState &State) {
2677   Value *ScalarInst = State.get(Def, Instance);
2678   Value *VectorValue = State.get(Def, Instance.Part);
2679   VectorValue = Builder.CreateInsertElement(
2680       VectorValue, ScalarInst,
2681       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2682   State.set(Def, VectorValue, Instance.Part);
2683 }
2684 
2685 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2686   assert(Vec->getType()->isVectorTy() && "Invalid type");
2687   return Builder.CreateVectorReverse(Vec, "reverse");
2688 }
2689 
2690 // Return whether we allow using masked interleave-groups (for dealing with
2691 // strided loads/stores that reside in predicated blocks, or for dealing
2692 // with gaps).
2693 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2694   // If an override option has been passed in for interleaved accesses, use it.
2695   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2696     return EnableMaskedInterleavedMemAccesses;
2697 
2698   return TTI.enableMaskedInterleavedAccessVectorization();
2699 }
2700 
2701 // Try to vectorize the interleave group that \p Instr belongs to.
2702 //
2703 // E.g. Translate following interleaved load group (factor = 3):
2704 //   for (i = 0; i < N; i+=3) {
2705 //     R = Pic[i];             // Member of index 0
2706 //     G = Pic[i+1];           // Member of index 1
2707 //     B = Pic[i+2];           // Member of index 2
2708 //     ... // do something to R, G, B
2709 //   }
2710 // To:
2711 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2712 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2713 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2714 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2715 //
2716 // Or translate following interleaved store group (factor = 3):
2717 //   for (i = 0; i < N; i+=3) {
2718 //     ... do something to R, G, B
2719 //     Pic[i]   = R;           // Member of index 0
2720 //     Pic[i+1] = G;           // Member of index 1
2721 //     Pic[i+2] = B;           // Member of index 2
2722 //   }
2723 // To:
2724 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2725 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2726 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2727 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2728 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2729 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2730     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2731     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2732     VPValue *BlockInMask) {
2733   Instruction *Instr = Group->getInsertPos();
2734   const DataLayout &DL = Instr->getModule()->getDataLayout();
2735 
2736   // Prepare for the vector type of the interleaved load/store.
2737   Type *ScalarTy = getLoadStoreType(Instr);
2738   unsigned InterleaveFactor = Group->getFactor();
2739   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2740   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2741 
2742   // Prepare for the new pointers.
2743   SmallVector<Value *, 2> AddrParts;
2744   unsigned Index = Group->getIndex(Instr);
2745 
2746   // TODO: extend the masked interleaved-group support to reversed access.
2747   assert((!BlockInMask || !Group->isReverse()) &&
2748          "Reversed masked interleave-group not supported.");
2749 
2750   // If the group is reverse, adjust the index to refer to the last vector lane
2751   // instead of the first. We adjust the index from the first vector lane,
2752   // rather than directly getting the pointer for lane VF - 1, because the
2753   // pointer operand of the interleaved access is supposed to be uniform. For
2754   // uniform instructions, we're only required to generate a value for the
2755   // first vector lane in each unroll iteration.
2756   if (Group->isReverse())
2757     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2758 
2759   for (unsigned Part = 0; Part < UF; Part++) {
2760     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2761     setDebugLocFromInst(AddrPart);
2762 
    // Note that the current instruction could be a member of any index. We
    // need to adjust the address to the member of index 0.
2765     //
2766     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2767     //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2769     //
2770     // E.g.  A[i+1] = a;     // Member of index 1
2771     //       A[i]   = b;     // Member of index 0
2772     //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2774 
2775     bool InBounds = false;
2776     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2777       InBounds = gep->isInBounds();
2778     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2779     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2780 
2781     // Cast to the vector pointer type.
2782     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2783     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2784     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2785   }
2786 
2787   setDebugLocFromInst(Instr);
2788   Value *PoisonVec = PoisonValue::get(VecTy);
2789 
2790   Value *MaskForGaps = nullptr;
2791   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2792     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2793     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2794   }
2795 
2796   // Vectorize the interleaved load group.
2797   if (isa<LoadInst>(Instr)) {
2798     // For each unroll part, create a wide load for the group.
2799     SmallVector<Value *, 2> NewLoads;
2800     for (unsigned Part = 0; Part < UF; Part++) {
2801       Instruction *NewLoad;
2802       if (BlockInMask || MaskForGaps) {
2803         assert(useMaskedInterleavedAccesses(*TTI) &&
2804                "masked interleaved groups are not allowed.");
2805         Value *GroupMask = MaskForGaps;
2806         if (BlockInMask) {
2807           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2808           Value *ShuffledMask = Builder.CreateShuffleVector(
2809               BlockInMaskPart,
2810               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2811               "interleaved.mask");
2812           GroupMask = MaskForGaps
2813                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2814                                                 MaskForGaps)
2815                           : ShuffledMask;
2816         }
2817         NewLoad =
2818             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2819                                      GroupMask, PoisonVec, "wide.masked.vec");
2820       }
2821       else
2822         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2823                                             Group->getAlign(), "wide.vec");
2824       Group->addMetadata(NewLoad);
2825       NewLoads.push_back(NewLoad);
2826     }
2827 
2828     // For each member in the group, shuffle out the appropriate data from the
2829     // wide loads.
2830     unsigned J = 0;
2831     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2832       Instruction *Member = Group->getMember(I);
2833 
2834       // Skip the gaps in the group.
2835       if (!Member)
2836         continue;
2837 
2838       auto StrideMask =
2839           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2840       for (unsigned Part = 0; Part < UF; Part++) {
2841         Value *StridedVec = Builder.CreateShuffleVector(
2842             NewLoads[Part], StrideMask, "strided.vec");
2843 
        // If this member has a different type, cast the result to that type.
2845         if (Member->getType() != ScalarTy) {
2846           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2847           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2848           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2849         }
2850 
2851         if (Group->isReverse())
2852           StridedVec = reverseVector(StridedVec);
2853 
2854         State.set(VPDefs[J], StridedVec, Part);
2855       }
2856       ++J;
2857     }
2858     return;
2859   }
2860 
  // The sub-vector type for the current instruction.
2862   auto *SubVT = VectorType::get(ScalarTy, VF);
2863 
2864   // Vectorize the interleaved store group.
2865   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2866   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2867          "masked interleaved groups are not allowed.");
2868   assert((!MaskForGaps || !VF.isScalable()) &&
2869          "masking gaps for scalable vectors is not yet supported.");
2870   for (unsigned Part = 0; Part < UF; Part++) {
2871     // Collect the stored vector from each member.
2872     SmallVector<Value *, 4> StoredVecs;
2873     for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Failed to get a member from an interleaved store group");
2876       Instruction *Member = Group->getMember(i);
2877 
2878       // Skip the gaps in the group.
2879       if (!Member) {
2880         Value *Undef = PoisonValue::get(SubVT);
2881         StoredVecs.push_back(Undef);
2882         continue;
2883       }
2884 
2885       Value *StoredVec = State.get(StoredValues[i], Part);
2886 
2887       if (Group->isReverse())
2888         StoredVec = reverseVector(StoredVec);
2889 
      // If this member has a different type, cast it to the unified type.
2892       if (StoredVec->getType() != SubVT)
2893         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2894 
2895       StoredVecs.push_back(StoredVec);
2896     }
2897 
2898     // Concatenate all vectors into a wide vector.
2899     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2900 
2901     // Interleave the elements in the wide vector.
2902     Value *IVec = Builder.CreateShuffleVector(
2903         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2904         "interleaved.vec");
2905 
2906     Instruction *NewStoreInstr;
2907     if (BlockInMask || MaskForGaps) {
2908       Value *GroupMask = MaskForGaps;
2909       if (BlockInMask) {
2910         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2911         Value *ShuffledMask = Builder.CreateShuffleVector(
2912             BlockInMaskPart,
2913             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2914             "interleaved.mask");
2915         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2916                                                       ShuffledMask, MaskForGaps)
2917                                 : ShuffledMask;
2918       }
2919       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2920                                                 Group->getAlign(), GroupMask);
2921     } else
2922       NewStoreInstr =
2923           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2924 
2925     Group->addMetadata(NewStoreInstr);
2926   }
2927 }
2928 
2929 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2930                                                VPReplicateRecipe *RepRecipe,
2931                                                const VPIteration &Instance,
2932                                                bool IfPredicateInstr,
2933                                                VPTransformState &State) {
2934   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2935 
  // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated
  // for the first lane and part.
2938   if (isa<NoAliasScopeDeclInst>(Instr))
2939     if (!Instance.isFirstIteration())
2940       return;
2941 
2942   setDebugLocFromInst(Instr);
2943 
  // Does this instruction return a value?
2945   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2946 
2947   Instruction *Cloned = Instr->clone();
2948   if (!IsVoidRetTy)
2949     Cloned->setName(Instr->getName() + ".cloned");
2950 
  // If the scalarized instruction contributes to the address computation of a
  // widened masked load/store which was in a basic block that needed
  // predication and is not predicated after vectorization, we can't propagate
  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
  // instruction could feed a poison value to the base address of the widened
  // load/store.
2957   if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0)
2958     Cloned->dropPoisonGeneratingFlags();
2959 
2960   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
2961                                Builder.GetInsertPoint());
2962   // Replace the operands of the cloned instructions with their scalar
2963   // equivalents in the new loop.
2964   for (auto &I : enumerate(RepRecipe->operands())) {
2965     auto InputInstance = Instance;
2966     VPValue *Operand = I.value();
2967     if (State.Plan->isUniformAfterVectorization(Operand))
2968       InputInstance.Lane = VPLane::getFirstLane();
2969     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2970   }
2971   addNewMetadata(Cloned, Instr);
2972 
2973   // Place the cloned scalar in the new loop.
2974   Builder.Insert(Cloned);
2975 
2976   State.set(RepRecipe, Cloned, Instance);
2977 
  // If we just cloned a new assumption, add it to the assumption cache.
2979   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2980     AC->registerAssumption(II);
2981 
2982   // End if-block.
2983   if (IfPredicateInstr)
2984     PredicatedInstructions.push_back(Cloned);
2985 }
2986 
2987 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2988                                                       Value *End, Value *Step,
2989                                                       Instruction *DL) {
2990   BasicBlock *Header = L->getHeader();
2991   BasicBlock *Latch = L->getLoopLatch();
2992   // As we're just creating this loop, it's possible no latch exists
2993   // yet. If so, use the header as this will be a single block loop.
2994   if (!Latch)
2995     Latch = Header;
2996 
2997   IRBuilder<> B(&*Header->getFirstInsertionPt());
2998   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2999   setDebugLocFromInst(OldInst, &B);
3000   auto *Induction = B.CreatePHI(Start->getType(), 2, "index");
3001 
3002   B.SetInsertPoint(Latch->getTerminator());
3003   setDebugLocFromInst(OldInst, &B);
3004 
3005   // Create i+1 and fill the PHINode.
3006   //
3007   // If the tail is not folded, we know that End - Start >= Step (either
3008   // statically or through the minimum iteration checks). We also know that both
3009   // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
3010   // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
3011   // overflows and we can mark the induction increment as NUW.
3012   Value *Next = B.CreateAdd(Induction, Step, "index.next",
3013                             /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
3014   Induction->addIncoming(Start, L->getLoopPreheader());
3015   Induction->addIncoming(Next, Latch);
3016   // Create the compare.
3017   Value *ICmp = B.CreateICmpEQ(Next, End);
3018   B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3019 
3020   // Now we have two terminators. Remove the old one from the block.
3021   Latch->getTerminator()->eraseFromParent();
3022 
3023   return Induction;
3024 }
3025 
3026 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3027   if (TripCount)
3028     return TripCount;
3029 
3030   assert(L && "Create Trip Count for null loop.");
3031   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3032   // Find the loop boundaries.
3033   ScalarEvolution *SE = PSE.getSE();
3034   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3035   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3036          "Invalid loop count");
3037 
3038   Type *IdxTy = Legal->getWidestInductionType();
3039   assert(IdxTy && "No type for induction");
3040 
  // The exit count might have type i64 while the phi has type i32. This can
  // happen if we have an induction variable that is sign-extended before the
  // compare. The only way we can get a backedge-taken count in that case is if
  // the induction variable was signed and as such will not overflow. In such a
  // case truncation is legal.
3046   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3047       IdxTy->getPrimitiveSizeInBits())
3048     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3049   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3050 
3051   // Get the total trip count from the count by adding 1.
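  // (For example, a loop "for (i = 0; i < n; ++i)" has a backedge-taken count
  // of n - 1 and a trip count of n.)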
3052   const SCEV *ExitCount = SE->getAddExpr(
3053       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3054 
3055   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3056 
3057   // Expand the trip count and place the new instructions in the preheader.
3058   // Notice that the pre-header does not change, only the loop body.
3059   SCEVExpander Exp(*SE, DL, "induction");
3060 
3061   // Count holds the overall loop count (N).
3062   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3063                                 L->getLoopPreheader()->getTerminator());
3064 
3065   if (TripCount->getType()->isPointerTy())
3066     TripCount =
3067         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3068                                     L->getLoopPreheader()->getTerminator());
3069 
3070   return TripCount;
3071 }
3072 
3073 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3074   if (VectorTripCount)
3075     return VectorTripCount;
3076 
3077   Value *TC = getOrCreateTripCount(L);
3078   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3079 
3080   Type *Ty = TC->getType();
3081   // This is where we can make the step a runtime constant.
3082   Value *Step = createStepForVF(Builder, Ty, VF, UF);
3083 
3084   // If the tail is to be folded by masking, round the number of iterations N
3085   // up to a multiple of Step instead of rounding down. This is done by first
3086   // adding Step-1 and then rounding down. Note that it's ok if this addition
3087   // overflows: the vector induction variable will eventually wrap to zero given
3088   // that it starts at zero and its Step is a power of two; the loop will then
3089   // exit, with the last early-exit vector comparison also producing all-true.
3090   if (Cost->foldTailByMasking()) {
3091     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3092            "VF*UF must be a power of 2 when folding tail by masking");
3093     assert(!VF.isScalable() &&
3094            "Tail folding not yet supported for scalable vectors");
3095     TC = Builder.CreateAdd(
3096         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3097   }
3098 
3099   // Now we need to generate the expression for the part of the loop that the
3100   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3101   // iterations are not required for correctness, or N - Step, otherwise. Step
3102   // is equal to the vectorization factor (number of SIMD elements) times the
3103   // unroll factor (number of SIMD instructions).
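  // For example, with N=10, VF=4 and UF=1 (so Step=4), n.mod.vf is 2 and
  // n.vec is 8: the vector loop covers the first 8 original iterations and
  // the remaining 2 run in the scalar epilogue.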
3104   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3105 
3106   // There are cases where we *must* run at least one iteration in the remainder
3107   // loop.  See the cost model for when this can happen.  If the step evenly
3108   // divides the trip count, we set the remainder to be equal to the step. If
3109   // the step does not evenly divide the trip count, no adjustment is necessary
3110   // since there will already be scalar iterations. Note that the minimum
3111   // iterations check ensures that N >= Step.
3112   if (Cost->requiresScalarEpilogue(VF)) {
3113     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3114     R = Builder.CreateSelect(IsZero, Step, R);
3115   }
3116 
3117   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3118 
3119   return VectorTripCount;
3120 }
3121 
3122 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3123                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
3125   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3126   unsigned VF = DstFVTy->getNumElements();
3127   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
3129   Type *SrcElemTy = SrcVecTy->getElementType();
3130   Type *DstElemTy = DstFVTy->getElementType();
3131   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3132          "Vector elements must have same size");
3133 
3134   // Do a direct cast if element types are castable.
3135   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3136     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3137   }
  // V cannot be directly cast to the desired vector type.
3139   // May happen when V is a floating point vector but DstVTy is a vector of
3140   // pointers or vice-versa. Handle this using a two-step bitcast using an
3141   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
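  // For example, on a target with 32-bit pointers a <4 x float> source and a
  // <4 x i8*> destination would be bridged as:
  //   <4 x float> -> <4 x i32> -> <4 x i8*>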
3142   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3143          "Only one type should be a pointer type");
3144   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3145          "Only one type should be a floating point type");
3146   Type *IntTy =
3147       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3148   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3149   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3150   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3151 }
3152 
3153 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3154                                                          BasicBlock *Bypass) {
3155   Value *Count = getOrCreateTripCount(L);
  // Reuse the existing vector loop preheader for the TC checks.
  // Note that a new preheader block is generated for the vector loop.
3158   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3159   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3160 
3161   // Generate code to check if the loop's trip count is less than VF * UF, or
3162   // equal to it in case a scalar epilogue is required; this implies that the
3163   // vector trip count is zero. This check also covers the case where adding one
3164   // to the backedge-taken count overflowed leading to an incorrect trip count
3165   // of zero. In this case we will also jump to the scalar loop.
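  // For example, with VF=4 and UF=2 (and the tail not folded) the check below
  // is "Count < 8", or "Count <= 8" when at least one scalar iteration must
  // remain for the epilogue; if it holds, the vector loop is skipped.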
3166   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
3167                                             : ICmpInst::ICMP_ULT;
3168 
3169   // If tail is to be folded, vector loop takes care of all iterations.
3170   Value *CheckMinIters = Builder.getFalse();
3171   if (!Cost->foldTailByMasking()) {
3172     Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
3173     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3174   }
3175   // Create new preheader for vector loop.
3176   LoopVectorPreHeader =
3177       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3178                  "vector.ph");
3179 
3180   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3181                                DT->getNode(Bypass)->getIDom()) &&
3182          "TC check is expected to dominate Bypass");
3183 
3184   // Update dominator for Bypass & LoopExit (if needed).
3185   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3186   if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
3190     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3191 
3192   ReplaceInstWithInst(
3193       TCCheckBlock->getTerminator(),
3194       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3195   LoopBypassBlocks.push_back(TCCheckBlock);
3196 }
3197 
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3200   BasicBlock *const SCEVCheckBlock =
3201       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3202   if (!SCEVCheckBlock)
3203     return nullptr;
3204 
3205   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3206            (OptForSizeBasedOnProfile &&
3207             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3208          "Cannot SCEV check stride or overflow when optimizing for size");
3209 
3210 
  // Update dominator only if this is the first RT check.
3212   if (LoopBypassBlocks.empty()) {
3213     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3214     if (!Cost->requiresScalarEpilogue(VF))
      // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
      // dominator of the exit blocks.
3218       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3219   }
3220 
3221   LoopBypassBlocks.push_back(SCEVCheckBlock);
3222   AddedSafetyChecks = true;
3223   return SCEVCheckBlock;
3224 }
3225 
3226 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3227                                                       BasicBlock *Bypass) {
3228   // VPlan-native path does not do any analysis for runtime checks currently.
3229   if (EnableVPlanNativePath)
3230     return nullptr;
3231 
3232   BasicBlock *const MemCheckBlock =
3233       RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3234 
  // Check if we generated code that checks at runtime whether arrays overlap.
  // We put the checks into a separate block to make the more common case of
  // few elements faster.
3238   if (!MemCheckBlock)
3239     return nullptr;
3240 
3241   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3242     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3243            "Cannot emit memory checks when optimizing for size, unless forced "
3244            "to vectorize.");
3245     ORE->emit([&]() {
3246       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3247                                         L->getStartLoc(), L->getHeader())
3248              << "Code-size may be reduced by not forcing "
3249                 "vectorization, or by source-code modifications "
3250                 "eliminating the need for runtime checks "
3251                 "(e.g., adding 'restrict').";
3252     });
3253   }
3254 
3255   LoopBypassBlocks.push_back(MemCheckBlock);
3256 
3257   AddedSafetyChecks = true;
3258 
3259   // We currently don't use LoopVersioning for the actual loop cloning but we
3260   // still use it to add the noalias metadata.
3261   LVer = std::make_unique<LoopVersioning>(
3262       *Legal->getLAI(),
3263       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3264       DT, PSE.getSE());
3265   LVer->prepareNoAliasMetadata();
3266   return MemCheckBlock;
3267 }
3268 
3269 Value *InnerLoopVectorizer::emitTransformedIndex(
3270     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3271     const InductionDescriptor &ID) const {
3272 
3273   SCEVExpander Exp(*SE, DL, "induction");
3274   auto Step = ID.getStep();
3275   auto StartValue = ID.getStartValue();
3276   assert(Index->getType()->getScalarType() == Step->getType() &&
3277          "Index scalar type does not match StepValue type");
3278 
3279   // Note: the IR at this point is broken. We cannot use SE to create any new
3280   // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
3282   // lead to various SCEV crashes. So all we can do is to use builder and rely
3283   // on InstCombine for future simplifications. Here we handle some trivial
3284   // cases only.
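  // Conceptually, for an integer induction the transformed index is
  // StartValue + Index * Step; e.g. with Start=16, Step=-2 and Index=5 the
  // result is 16 + 5 * -2 = 6.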
3285   auto CreateAdd = [&B](Value *X, Value *Y) {
3286     assert(X->getType() == Y->getType() && "Types don't match!");
3287     if (auto *CX = dyn_cast<ConstantInt>(X))
3288       if (CX->isZero())
3289         return Y;
3290     if (auto *CY = dyn_cast<ConstantInt>(Y))
3291       if (CY->isZero())
3292         return X;
3293     return B.CreateAdd(X, Y);
3294   };
3295 
3296   // We allow X to be a vector type, in which case Y will potentially be
3297   // splatted into a vector with the same element count.
3298   auto CreateMul = [&B](Value *X, Value *Y) {
3299     assert(X->getType()->getScalarType() == Y->getType() &&
3300            "Types don't match!");
3301     if (auto *CX = dyn_cast<ConstantInt>(X))
3302       if (CX->isOne())
3303         return Y;
3304     if (auto *CY = dyn_cast<ConstantInt>(Y))
3305       if (CY->isOne())
3306         return X;
3307     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3308     if (XVTy && !isa<VectorType>(Y->getType()))
3309       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3310     return B.CreateMul(X, Y);
3311   };
3312 
3313   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3314   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3315   // the DomTree is not kept up-to-date for additional blocks generated in the
3316   // vector loop. By using the header as insertion point, we guarantee that the
3317   // expanded instructions dominate all their uses.
3318   auto GetInsertPoint = [this, &B]() {
3319     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3320     if (InsertBB != LoopVectorBody &&
3321         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3322       return LoopVectorBody->getTerminator();
3323     return &*B.GetInsertPoint();
3324   };
3325 
3326   switch (ID.getKind()) {
3327   case InductionDescriptor::IK_IntInduction: {
3328     assert(!isa<VectorType>(Index->getType()) &&
3329            "Vector indices not supported for integer inductions yet");
3330     assert(Index->getType() == StartValue->getType() &&
3331            "Index type does not match StartValue type");
3332     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3333       return B.CreateSub(StartValue, Index);
3334     auto *Offset = CreateMul(
3335         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3336     return CreateAdd(StartValue, Offset);
3337   }
3338   case InductionDescriptor::IK_PtrInduction: {
3339     assert(isa<SCEVConstant>(Step) &&
3340            "Expected constant step for pointer induction");
3341     return B.CreateGEP(
3342         ID.getElementType(), StartValue,
3343         CreateMul(Index,
3344                   Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3345                                     GetInsertPoint())));
3346   }
3347   case InductionDescriptor::IK_FpInduction: {
3348     assert(!isa<VectorType>(Index->getType()) &&
3349            "Vector indices not supported for FP inductions yet");
3350     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3351     auto InductionBinOp = ID.getInductionBinOp();
3352     assert(InductionBinOp &&
3353            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3354             InductionBinOp->getOpcode() == Instruction::FSub) &&
3355            "Original bin op should be defined for FP induction");
3356 
3357     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3358     Value *MulExp = B.CreateFMul(StepValue, Index);
3359     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3360                          "induction");
3361   }
3362   case InductionDescriptor::IK_NoInduction:
3363     return nullptr;
3364   }
3365   llvm_unreachable("invalid enum");
3366 }
3367 
3368 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3369   LoopScalarBody = OrigLoop->getHeader();
3370   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3371   assert(LoopVectorPreHeader && "Invalid loop structure");
3372   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3373   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3374          "multiple exit loop without required epilogue?");
3375 
3376   LoopMiddleBlock =
3377       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3378                  LI, nullptr, Twine(Prefix) + "middle.block");
3379   LoopScalarPreHeader =
3380       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3381                  nullptr, Twine(Prefix) + "scalar.ph");
3382 
3383   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3384 
3385   // Set up the middle block terminator.  Two cases:
3386   // 1) If we know that we must execute the scalar epilogue, emit an
3387   //    unconditional branch.
3388   // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case).  In this case, set up a conditional
3390   //    branch from the middle block to the loop scalar preheader, and the
3391   //    exit block.  completeLoopSkeleton will update the condition to use an
3392   //    iteration check, if required to decide whether to execute the remainder.
3393   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3394     BranchInst::Create(LoopScalarPreHeader) :
3395     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3396                        Builder.getTrue());
3397   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3398   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3399 
  // We intentionally don't let SplitBlock update LoopInfo since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
3403   LoopVectorBody =
3404       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3405                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3406 
3407   // Update dominator for loop exit.
3408   if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
3412     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3413 
3414   // Create and register the new vector loop.
3415   Loop *Lp = LI->AllocateLoop();
3416   Loop *ParentLoop = OrigLoop->getParentLoop();
3417 
3418   // Insert the new loop into the loop nest and register the new basic blocks
3419   // before calling any utilities such as SCEV that require valid LoopInfo.
3420   if (ParentLoop) {
3421     ParentLoop->addChildLoop(Lp);
3422   } else {
3423     LI->addTopLevelLoop(Lp);
3424   }
3425   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3426   return Lp;
3427 }
3428 
3429 void InnerLoopVectorizer::createInductionResumeValues(
3430     Loop *L, Value *VectorTripCount,
3431     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3432   assert(VectorTripCount && L && "Expected valid arguments");
3433   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3434           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3435          "Inconsistent information about additional bypass.");
3436   // We are going to resume the execution of the scalar loop.
3437   // Go over all of the induction variables that we found and fix the
3438   // PHIs that are left in the scalar version of the loop.
3439   // The starting values of PHI nodes depend on the counter of the last
3440   // iteration in the vectorized loop.
3441   // If we come from a bypass edge then we need to start from the original
3442   // start value.
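  // For example, for the primary induction variable the resume value is the
  // vector trip count when entering the scalar loop from the middle block,
  // and the original start value when entering it from a bypass block.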
3443   for (auto &InductionEntry : Legal->getInductionVars()) {
3444     PHINode *OrigPhi = InductionEntry.first;
3445     InductionDescriptor II = InductionEntry.second;
3446 
    // Create phi nodes to merge from the backedge-taken check block.
3448     PHINode *BCResumeVal =
3449         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3450                         LoopScalarPreHeader->getTerminator());
3451     // Copy original phi DL over to the new one.
3452     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3453     Value *&EndValue = IVEndValues[OrigPhi];
3454     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3455     if (OrigPhi == OldInduction) {
3456       // We know what the end value is.
3457       EndValue = VectorTripCount;
3458     } else {
3459       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3460 
3461       // Fast-math-flags propagate from the original induction instruction.
3462       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3463         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3464 
3465       Type *StepType = II.getStep()->getType();
3466       Instruction::CastOps CastOp =
3467           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3468       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3469       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3470       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3471       EndValue->setName("ind.end");
3472 
3473       // Compute the end value for the additional bypass (if applicable).
3474       if (AdditionalBypass.first) {
3475         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3476         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3477                                          StepType, true);
3478         CRD =
3479             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3480         EndValueFromAdditionalBypass =
3481             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3482         EndValueFromAdditionalBypass->setName("ind.end");
3483       }
3484     }
3485     // The new PHI merges the original incoming value, in case of a bypass,
3486     // or the value at the end of the vectorized loop.
3487     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3488 
3489     // Fix the scalar body counter (PHI node).
3490     // The old induction's phi node in the scalar body needs the truncated
3491     // value.
3492     for (BasicBlock *BB : LoopBypassBlocks)
3493       BCResumeVal->addIncoming(II.getStartValue(), BB);
3494 
3495     if (AdditionalBypass.first)
3496       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3497                                             EndValueFromAdditionalBypass);
3498 
3499     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3500   }
3501 }
3502 
3503 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3504                                                       MDNode *OrigLoopID) {
3505   assert(L && "Expected valid loop.");
3506 
3507   // The trip counts should be cached by now.
3508   Value *Count = getOrCreateTripCount(L);
3509   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3510 
3511   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3512 
3513   // Add a check in the middle block to see if we have completed
3514   // all of the iterations in the first vector loop.  Three cases:
3515   // 1) If we require a scalar epilogue, there is no conditional branch as
3516   //    we unconditionally branch to the scalar preheader.  Do nothing.
3517   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3518   //    Thus if tail is to be folded, we know we don't need to run the
3519   //    remainder and we can use the previous value for the condition (true).
3520   // 3) Otherwise, construct a runtime check.
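  // For example, with a trip count of 10 and a vector trip count of 8, cmp.n
  // is false and the scalar loop runs the remaining 2 iterations.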
3521   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3522     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3523                                         Count, VectorTripCount, "cmp.n",
3524                                         LoopMiddleBlock->getTerminator());
3525 
3526     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3527     // of the corresponding compare because they may have ended up with
3528     // different line numbers and we want to avoid awkward line stepping while
    // debugging, e.g. if the compare got a line number inside the loop.
3530     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3531     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3532   }
3533 
3534   // Get ready to start creating new instructions into the vectorized body.
3535   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3536          "Inconsistent vector loop preheader");
3537   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3538 
3539   Optional<MDNode *> VectorizedLoopID =
3540       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3541                                       LLVMLoopVectorizeFollowupVectorized});
3542   if (VectorizedLoopID.hasValue()) {
3543     L->setLoopID(VectorizedLoopID.getValue());
3544 
3545     // Do not setAlreadyVectorized if loop attributes have been defined
3546     // explicitly.
3547     return LoopVectorPreHeader;
3548   }
3549 
3550   // Keep all loop hints from the original loop on the vector loop (we'll
3551   // replace the vectorizer-specific hints below).
3552   if (MDNode *LID = OrigLoop->getLoopID())
3553     L->setLoopID(LID);
3554 
3555   LoopVectorizeHints Hints(L, true, *ORE);
3556   Hints.setAlreadyVectorized();
3557 
3558 #ifdef EXPENSIVE_CHECKS
3559   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3560   LI->verify(*DT);
3561 #endif
3562 
3563   return LoopVectorPreHeader;
3564 }
3565 
3566 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3567   /*
3568    In this function we generate a new loop. The new loop will contain
3569    the vectorized instructions while the old loop will continue to run the
3570    scalar remainder.
3571 
3572        [ ] <-- loop iteration number check.
3573     /   |
3574    /    v
3575   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3576   |  /  |
3577   | /   v
3578   ||   [ ]     <-- vector pre header.
3579   |/    |
3580   |     v
3581   |    [  ] \
3582   |    [  ]_|   <-- vector loop.
3583   |     |
3584   |     v
3585   \   -[ ]   <--- middle-block.
3586    \/   |
3587    /\   v
3588    | ->[ ]     <--- new preheader.
3589    |    |
3590  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3591    |   [ ] \
3592    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3593     \   |
3594      \  v
3595       >[ ]     <-- exit block(s).
3596    ...
3597    */
3598 
3599   // Get the metadata of the original loop before it gets modified.
3600   MDNode *OrigLoopID = OrigLoop->getLoopID();
3601 
3602   // Workaround!  Compute the trip count of the original loop and cache it
3603   // before we start modifying the CFG.  This code has a systemic problem
3604   // wherein it tries to run analysis over partially constructed IR; this is
3605   // wrong, and not simply for SCEV.  The trip count of the original loop
3606   // simply happens to be prone to hitting this in practice.  In theory, we
3607   // can hit the same issue for any SCEV, or ValueTracking query done during
3608   // mutation.  See PR49900.
3609   getOrCreateTripCount(OrigLoop);
3610 
3611   // Create an empty vector loop, and prepare basic blocks for the runtime
3612   // checks.
3613   Loop *Lp = createVectorLoopSkeleton("");
3614 
3615   // Now, compare the new count to zero. If it is zero skip the vector loop and
3616   // jump to the scalar loop. This check also covers the case where the
3617   // backedge-taken count is uint##_max: adding one to it will overflow leading
3618   // to an incorrect trip count of zero. In this (rare) case we will also jump
3619   // to the scalar loop.
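  // For example, with VF = 4 and UF = 2 a trip count smaller than 8 will
  // typically branch straight to the scalar loop.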
3620   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3621 
3622   // Generate the code to check any assumptions that we've made for SCEV
3623   // expressions.
3624   emitSCEVChecks(Lp, LoopScalarPreHeader);
3625 
3626   // Generate the code that checks in runtime if arrays overlap. We put the
3627   // checks into a separate block to make the more common case of few elements
3628   // faster.
3629   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3630 
  // Some loops have a single integer induction variable, while others don't.
  // One example is C++ iterator loops, which often have multiple pointer
  // induction variables. The code below also supports the case where there is
  // no single induction variable.
  //
  // We try as hard as possible to obtain an induction variable from the
  // original loop. However, if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
3642   OldInduction = Legal->getPrimaryInduction();
3643   Type *IdxTy = Legal->getWidestInductionType();
3644   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3645   // The loop step is equal to the vectorization factor (num of SIMD elements)
3646   // times the unroll factor (num of SIMD instructions).
3647   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3648   Value *Step = createStepForVF(Builder, IdxTy, VF, UF);
3649   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3650   Induction =
3651       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3652                               getDebugLocFromInstOrOperands(OldInduction));
3653 
3654   // Emit phis for the new starting index of the scalar loop.
3655   createInductionResumeValues(Lp, CountRoundDown);
3656 
3657   return completeLoopSkeleton(Lp, OrigLoopID);
3658 }
3659 
3660 // Fix up external users of the induction variable. At this point, we are
3661 // in LCSSA form, with all external PHIs that use the IV having one input value,
3662 // coming from the remainder loop. We need those PHIs to also have a correct
3663 // value for the IV when arriving directly from the middle block.
3664 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3665                                        const InductionDescriptor &II,
3666                                        Value *CountRoundDown, Value *EndValue,
3667                                        BasicBlock *MiddleBlock) {
3668   // There are two kinds of external IV usages - those that use the value
3669   // computed in the last iteration (the PHI) and those that use the penultimate
3670   // value (the value that feeds into the phi from the loop latch).
3671   // We allow both, but they, obviously, have different values.
3672 
3673   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3674 
3675   DenseMap<Value *, Value *> MissingVals;
3676 
3677   // An external user of the last iteration's value should see the value that
3678   // the remainder loop uses to initialize its own IV.
3679   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3680   for (User *U : PostInc->users()) {
3681     Instruction *UI = cast<Instruction>(U);
3682     if (!OrigLoop->contains(UI)) {
3683       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3684       MissingVals[UI] = EndValue;
3685     }
3686   }
3687 
  // An external user of the penultimate value needs to see EndValue - Step.
3689   // The simplest way to get this is to recompute it from the constituent SCEVs,
3690   // that is Start + (Step * (CRD - 1)).
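  // For example (illustrative values only): for an IV with Start = 0,
  // Step = 2 and CRD = 8, the penultimate value is 0 + 2 * (8 - 1) = 14.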
3691   for (User *U : OrigPhi->users()) {
3692     auto *UI = cast<Instruction>(U);
3693     if (!OrigLoop->contains(UI)) {
3694       const DataLayout &DL =
3695           OrigLoop->getHeader()->getModule()->getDataLayout();
3696       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3697 
3698       IRBuilder<> B(MiddleBlock->getTerminator());
3699 
3700       // Fast-math-flags propagate from the original induction instruction.
3701       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3702         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3703 
3704       Value *CountMinusOne = B.CreateSub(
3705           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3706       Value *CMO =
3707           !II.getStep()->getType()->isIntegerTy()
3708               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3709                              II.getStep()->getType())
3710               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3711       CMO->setName("cast.cmo");
3712       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3713       Escape->setName("ind.escape");
3714       MissingVals[UI] = Escape;
3715     }
3716   }
3717 
3718   for (auto &I : MissingVals) {
3719     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3721     // that is %IV2 = phi [...], [ %IV1, %latch ]
3722     // In this case, if IV1 has an external use, we need to avoid adding both
3723     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3724     // don't already have an incoming value for the middle block.
3725     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3726       PHI->addIncoming(I.second, MiddleBlock);
3727   }
3728 }
3729 
3730 namespace {
3731 
3732 struct CSEDenseMapInfo {
3733   static bool canHandle(const Instruction *I) {
3734     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3735            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3736   }
3737 
3738   static inline Instruction *getEmptyKey() {
3739     return DenseMapInfo<Instruction *>::getEmptyKey();
3740   }
3741 
3742   static inline Instruction *getTombstoneKey() {
3743     return DenseMapInfo<Instruction *>::getTombstoneKey();
3744   }
3745 
3746   static unsigned getHashValue(const Instruction *I) {
3747     assert(canHandle(I) && "Unknown instruction!");
3748     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3749                                                            I->value_op_end()));
3750   }
3751 
3752   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3753     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3754         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3755       return LHS == RHS;
3756     return LHS->isIdenticalTo(RHS);
3757   }
3758 };
3759 
3760 } // end anonymous namespace
3761 
/// Perform CSE of induction variable instructions.
3763 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
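  // For example, identical GEPs created while widening an induction hash to
  // the same key, so uses of each later duplicate are rewritten to the first
  // occurrence and the duplicate is erased.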
3765   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3766   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3767     if (!CSEDenseMapInfo::canHandle(&In))
3768       continue;
3769 
3770     // Check if we can replace this instruction with any of the
3771     // visited instructions.
3772     if (Instruction *V = CSEMap.lookup(&In)) {
3773       In.replaceAllUsesWith(V);
3774       In.eraseFromParent();
3775       continue;
3776     }
3777 
3778     CSEMap[&In] = &In;
3779   }
3780 }
3781 
3782 InstructionCost
3783 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3784                                               bool &NeedToScalarize) const {
3785   Function *F = CI->getCalledFunction();
3786   Type *ScalarRetTy = CI->getType();
3787   SmallVector<Type *, 4> Tys, ScalarTys;
3788   for (auto &ArgOp : CI->args())
3789     ScalarTys.push_back(ArgOp->getType());
3790 
3791   // Estimate cost of scalarized vector call. The source operands are assumed
3792   // to be vectors, so we need to extract individual elements from there,
3793   // execute VF scalar calls, and then gather the result into the vector return
3794   // value.
3795   InstructionCost ScalarCallCost =
3796       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3797   if (VF.isScalar())
3798     return ScalarCallCost;
3799 
3800   // Compute corresponding vector type for return value and arguments.
3801   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3802   for (Type *ScalarTy : ScalarTys)
3803     Tys.push_back(ToVectorTy(ScalarTy, VF));
3804 
3805   // Compute costs of unpacking argument values for the scalar calls and
3806   // packing the return values to a vector.
3807   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3808 
3809   InstructionCost Cost =
3810       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
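  // For example (illustrative numbers only): with VF = 4, a scalar call cost
  // of 10 and a scalarization overhead of 6, the estimate is 4 * 10 + 6 = 46.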
3811 
3812   // If we can't emit a vector call for this function, then the currently found
3813   // cost is the cost we need to return.
3814   NeedToScalarize = true;
3815   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3816   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3817 
3818   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3819     return Cost;
3820 
3821   // If the corresponding vector cost is cheaper, return its cost.
3822   InstructionCost VectorCallCost =
3823       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3824   if (VectorCallCost < Cost) {
3825     NeedToScalarize = false;
3826     Cost = VectorCallCost;
3827   }
3828   return Cost;
3829 }
3830 
3831 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3832   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3833     return Elt;
3834   return VectorType::get(Elt, VF);
3835 }
3836 
3837 InstructionCost
3838 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3839                                                    ElementCount VF) const {
3840   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3841   assert(ID && "Expected intrinsic call!");
3842   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3843   FastMathFlags FMF;
3844   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3845     FMF = FPMO->getFastMathFlags();
3846 
3847   SmallVector<const Value *> Arguments(CI->args());
3848   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3849   SmallVector<Type *> ParamTys;
3850   std::transform(FTy->param_begin(), FTy->param_end(),
3851                  std::back_inserter(ParamTys),
3852                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3853 
3854   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3855                                     dyn_cast<IntrinsicInst>(CI));
3856   return TTI.getIntrinsicInstrCost(CostAttrs,
3857                                    TargetTransformInfo::TCK_RecipThroughput);
3858 }
3859 
3860 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3861   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3862   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3863   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3864 }
3865 
3866 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3867   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3868   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3869   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3870 }
3871 
3872 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3873   // For every instruction `I` in MinBWs, truncate the operands, create a
3874   // truncated version of `I` and reextend its result. InstCombine runs
3875   // later and will remove any ext/trunc pairs.
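  // For example, if MinBWs records that an i32 operation only needs 8 bits,
  // its <4 x i32> clone is rewritten as a trunc of the operands to <4 x i8>,
  // an 8-bit operation, and a zext of the result back to <4 x i32>.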
3876   SmallPtrSet<Value *, 4> Erased;
3877   for (const auto &KV : Cost->getMinimalBitwidths()) {
3878     // If the value wasn't vectorized, we must maintain the original scalar
3879     // type. The absence of the value from State indicates that it
3880     // wasn't vectorized.
3881     // FIXME: Should not rely on getVPValue at this point.
3882     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3883     if (!State.hasAnyVectorValue(Def))
3884       continue;
3885     for (unsigned Part = 0; Part < UF; ++Part) {
3886       Value *I = State.get(Def, Part);
3887       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3888         continue;
3889       Type *OriginalTy = I->getType();
3890       Type *ScalarTruncatedTy =
3891           IntegerType::get(OriginalTy->getContext(), KV.second);
3892       auto *TruncatedTy = VectorType::get(
3893           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3894       if (TruncatedTy == OriginalTy)
3895         continue;
3896 
3897       IRBuilder<> B(cast<Instruction>(I));
3898       auto ShrinkOperand = [&](Value *V) -> Value * {
3899         if (auto *ZI = dyn_cast<ZExtInst>(V))
3900           if (ZI->getSrcTy() == TruncatedTy)
3901             return ZI->getOperand(0);
3902         return B.CreateZExtOrTrunc(V, TruncatedTy);
3903       };
3904 
3905       // The actual instruction modification depends on the instruction type,
3906       // unfortunately.
3907       Value *NewI = nullptr;
3908       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3909         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3910                              ShrinkOperand(BO->getOperand(1)));
3911 
3912         // Any wrapping introduced by shrinking this operation shouldn't be
3913         // considered undefined behavior. So, we can't unconditionally copy
3914         // arithmetic wrapping flags to NewI.
3915         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3916       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3917         NewI =
3918             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3919                          ShrinkOperand(CI->getOperand(1)));
3920       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3921         NewI = B.CreateSelect(SI->getCondition(),
3922                               ShrinkOperand(SI->getTrueValue()),
3923                               ShrinkOperand(SI->getFalseValue()));
3924       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3925         switch (CI->getOpcode()) {
3926         default:
3927           llvm_unreachable("Unhandled cast!");
3928         case Instruction::Trunc:
3929           NewI = ShrinkOperand(CI->getOperand(0));
3930           break;
3931         case Instruction::SExt:
3932           NewI = B.CreateSExtOrTrunc(
3933               CI->getOperand(0),
3934               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3935           break;
3936         case Instruction::ZExt:
3937           NewI = B.CreateZExtOrTrunc(
3938               CI->getOperand(0),
3939               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3940           break;
3941         }
3942       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3943         auto Elements0 =
3944             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3945         auto *O0 = B.CreateZExtOrTrunc(
3946             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3947         auto Elements1 =
3948             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3949         auto *O1 = B.CreateZExtOrTrunc(
3950             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3951 
3952         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3953       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3954         // Don't do anything with the operands, just extend the result.
3955         continue;
3956       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3957         auto Elements =
3958             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3959         auto *O0 = B.CreateZExtOrTrunc(
3960             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3961         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3962         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3963       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3964         auto Elements =
3965             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3966         auto *O0 = B.CreateZExtOrTrunc(
3967             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3968         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3969       } else {
3970         // If we don't know what to do, be conservative and don't do anything.
3971         continue;
3972       }
3973 
3974       // Lastly, extend the result.
3975       NewI->takeName(cast<Instruction>(I));
3976       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3977       I->replaceAllUsesWith(Res);
3978       cast<Instruction>(I)->eraseFromParent();
3979       Erased.insert(I);
3980       State.reset(Def, Res, Part);
3981     }
3982   }
3983 
3984   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3985   for (const auto &KV : Cost->getMinimalBitwidths()) {
3986     // If the value wasn't vectorized, we must maintain the original scalar
3987     // type. The absence of the value from State indicates that it
3988     // wasn't vectorized.
3989     // FIXME: Should not rely on getVPValue at this point.
3990     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3991     if (!State.hasAnyVectorValue(Def))
3992       continue;
3993     for (unsigned Part = 0; Part < UF; ++Part) {
3994       Value *I = State.get(Def, Part);
3995       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3996       if (Inst && Inst->use_empty()) {
3997         Value *NewI = Inst->getOperand(0);
3998         Inst->eraseFromParent();
3999         State.reset(Def, NewI, Part);
4000       }
4001     }
4002   }
4003 }
4004 
4005 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
4006   // Insert truncates and extends for any truncated instructions as hints to
4007   // InstCombine.
4008   if (VF.isVector())
4009     truncateToMinimalBitwidths(State);
4010 
4011   // Fix widened non-induction PHIs by setting up the PHI operands.
4012   if (OrigPHIsToFix.size()) {
4013     assert(EnableVPlanNativePath &&
4014            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
4015     fixNonInductionPHIs(State);
4016   }
4017 
4018   // At this point every instruction in the original loop is widened to a
4019   // vector form. Now we need to fix the recurrences in the loop. These PHI
4020   // nodes are currently empty because we did not want to introduce cycles.
4021   // This is the second stage of vectorizing recurrences.
4022   fixCrossIterationPHIs(State);
4023 
4024   // Forget the original basic block.
4025   PSE.getSE()->forgetLoop(OrigLoop);
4026 
4027   // If we inserted an edge from the middle block to the unique exit block,
4028   // update uses outside the loop (phis) to account for the newly inserted
4029   // edge.
4030   if (!Cost->requiresScalarEpilogue(VF)) {
4031     // Fix-up external users of the induction variables.
4032     for (auto &Entry : Legal->getInductionVars())
4033       fixupIVUsers(Entry.first, Entry.second,
4034                    getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4035                    IVEndValues[Entry.first], LoopMiddleBlock);
4036 
4037     fixLCSSAPHIs(State);
4038   }
4039 
4040   for (Instruction *PI : PredicatedInstructions)
4041     sinkScalarOperands(&*PI);
4042 
4043   // Remove redundant induction instructions.
4044   cse(LoopVectorBody);
4045 
  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // we assume a pessimistic vscale of '1'.
4059   setProfileInfoAfterUnrolling(
4060       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4061       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4062 }
4063 
4064 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4065   // In order to support recurrences we need to be able to vectorize Phi nodes.
4066   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4067   // stage #2: We now need to fix the recurrences by adding incoming edges to
4068   // the currently empty PHI nodes. At this point every instruction in the
4069   // original loop is widened to a vector form so we can use them to construct
4070   // the incoming edges.
4071   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4072   for (VPRecipeBase &R : Header->phis()) {
4073     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4074       fixReduction(ReductionPhi, State);
4075     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4076       fixFirstOrderRecurrence(FOR, State);
4077   }
4078 }
4079 
4080 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4081                                                   VPTransformState &State) {
4082   // This is the second phase of vectorizing first-order recurrences. An
4083   // overview of the transformation is described below. Suppose we have the
4084   // following loop.
4085   //
4086   //   for (int i = 0; i < n; ++i)
4087   //     b[i] = a[i] - a[i - 1];
4088   //
4089   // There is a first-order recurrence on "a". For this loop, the shorthand
4090   // scalar IR looks like:
4091   //
4092   //   scalar.ph:
4093   //     s_init = a[-1]
4094   //     br scalar.body
4095   //
4096   //   scalar.body:
4097   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4098   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4099   //     s2 = a[i]
4100   //     b[i] = s2 - s1
4101   //     br cond, scalar.body, ...
4102   //
  // In this example, s1 is a recurrence because its value depends on the
4104   // previous iteration. In the first phase of vectorization, we created a
4105   // vector phi v1 for s1. We now complete the vectorization and produce the
4106   // shorthand vector IR shown below (for VF = 4, UF = 1).
4107   //
4108   //   vector.ph:
4109   //     v_init = vector(..., ..., ..., a[-1])
4110   //     br vector.body
4111   //
4112   //   vector.body
4113   //     i = phi [0, vector.ph], [i+4, vector.body]
4114   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
4115   //     v2 = a[i, i+1, i+2, i+3];
4116   //     v3 = vector(v1(3), v2(0, 1, 2))
4117   //     b[i, i+1, i+2, i+3] = v2 - v3
4118   //     br cond, vector.body, middle.block
4119   //
4120   //   middle.block:
4121   //     x = v2(3)
4122   //     br scalar.ph
4123   //
4124   //   scalar.ph:
4125   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4126   //     br scalar.body
4127   //
  // After the vector loop has executed, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
4130 
4131   // Extract the last vector element in the middle block. This will be the
4132   // initial value for the recurrence when jumping to the scalar loop.
4133   VPValue *PreviousDef = PhiR->getBackedgeValue();
4134   Value *Incoming = State.get(PreviousDef, UF - 1);
4135   auto *ExtractForScalar = Incoming;
4136   auto *IdxTy = Builder.getInt32Ty();
4137   if (VF.isVector()) {
4138     auto *One = ConstantInt::get(IdxTy, 1);
4139     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4140     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4141     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4142     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
4143                                                     "vector.recur.extract");
4144   }
  // Extract the second-to-last element in the middle block if the
4146   // Phi is used outside the loop. We need to extract the phi itself
4147   // and not the last element (the phi update in the current iteration). This
4148   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4149   // when the scalar loop is not run at all.
4150   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4151   if (VF.isVector()) {
4152     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4153     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4154     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4155         Incoming, Idx, "vector.recur.extract.for.phi");
  } else if (UF > 1)
    // When the loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
    // value of `Incoming`. This is analogous to the vectorized case above:
    // extracting the second-to-last element when VF > 1.
4161     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4162 
4163   // Fix the initial value of the original recurrence in the scalar loop.
4164   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4165   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4166   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4167   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4168   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4169     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4170     Start->addIncoming(Incoming, BB);
4171   }
4172 
4173   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4174   Phi->setName("scalar.recur");
4175 
4176   // Finally, fix users of the recurrence outside the loop. The users will need
4177   // either the last value of the scalar recurrence or the last value of the
4178   // vector recurrence we extracted in the middle block. Since the loop is in
4179   // LCSSA form, we just need to find all the phi nodes for the original scalar
4180   // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from the middle block to
  // the exit block, and thus no phis that need updating.
4185   if (!Cost->requiresScalarEpilogue(VF))
4186     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4187       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4188         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4189 }
4190 
4191 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4192                                        VPTransformState &State) {
4193   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
4195   assert(Legal->isReductionVariable(OrigPhi) &&
4196          "Unable to find the reduction variable");
4197   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4198 
4199   RecurKind RK = RdxDesc.getRecurrenceKind();
4200   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4201   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4202   setDebugLocFromInst(ReductionStartValue);
4203 
4204   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4205   // This is the vector-clone of the value that leaves the loop.
4206   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4207 
4208   // Wrap flags are in general invalid after vectorization, clear them.
4209   clearReductionWrapFlags(RdxDesc, State);
4210 
4211   // Before each round, move the insertion point right between
4212   // the PHIs and the values we are going to write.
4213   // This allows us to write both PHINodes and the extractelement
4214   // instructions.
4215   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4216 
4217   setDebugLocFromInst(LoopExitInst);
4218 
4219   Type *PhiTy = OrigPhi->getType();
4220   // If tail is folded by masking, the vector value to leave the loop should be
4221   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4222   // instead of the former. For an inloop reduction the reduction will already
4223   // be predicated, and does not need to be handled here.
4224   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4225     for (unsigned Part = 0; Part < UF; ++Part) {
4226       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4227       Value *Sel = nullptr;
4228       for (User *U : VecLoopExitInst->users()) {
4229         if (isa<SelectInst>(U)) {
4230           assert(!Sel && "Reduction exit feeding two selects");
4231           Sel = U;
4232         } else
4233           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4234       }
4235       assert(Sel && "Reduction exit feeds no select");
4236       State.reset(LoopExitInstDef, Sel, Part);
4237 
4238       // If the target can create a predicated operator for the reduction at no
4239       // extra cost in the loop (for example a predicated vadd), it can be
4240       // cheaper for the select to remain in the loop than be sunk out of it,
4241       // and so use the select value for the phi instead of the old
4242       // LoopExitValue.
4243       if (PreferPredicatedReductionSelect ||
4244           TTI->preferPredicatedReductionSelect(
4245               RdxDesc.getOpcode(), PhiTy,
4246               TargetTransformInfo::ReductionFlags())) {
4247         auto *VecRdxPhi =
4248             cast<PHINode>(State.get(PhiR, Part));
4249         VecRdxPhi->setIncomingValueForBlock(
4250             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4251       }
4252     }
4253   }
4254 
4255   // If the vector reduction can be performed in a smaller type, we truncate
4256   // then extend the loop exit value to enable InstCombine to evaluate the
4257   // entire expression in the smaller type.
4258   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4259     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
4260     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4261     Builder.SetInsertPoint(
4262         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4263     VectorParts RdxParts(UF);
4264     for (unsigned Part = 0; Part < UF; ++Part) {
4265       RdxParts[Part] = State.get(LoopExitInstDef, Part);
4266       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4267       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4268                                         : Builder.CreateZExt(Trunc, VecTy);
4269       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
4270         if (U != Trunc) {
4271           U->replaceUsesOfWith(RdxParts[Part], Extnd);
4272           RdxParts[Part] = Extnd;
4273         }
4274     }
4275     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4276     for (unsigned Part = 0; Part < UF; ++Part) {
4277       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4278       State.reset(LoopExitInstDef, RdxParts[Part], Part);
4279     }
4280   }
4281 
4282   // Reduce all of the unrolled parts into a single vector.
4283   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4284   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
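  // For example, with UF = 2 and an integer add reduction, the two unrolled
  // parts are combined below with a single 'add' before the final horizontal
  // reduction is created.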
4285 
4286   // The middle block terminator has already been assigned a DebugLoc here (the
4287   // OrigLoop's single latch terminator). We want the whole middle block to
4288   // appear to execute on this line because: (a) it is all compiler generated,
4289   // (b) these instructions are always executed after evaluating the latch
4290   // conditional branch, and (c) other passes may add new predecessors which
4291   // terminate on this line. This is the easiest way to ensure we don't
4292   // accidentally cause an extra step back into the loop while debugging.
4293   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
4294   if (PhiR->isOrdered())
4295     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4296   else {
4297     // Floating-point operations should have some FMF to enable the reduction.
4298     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4299     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4300     for (unsigned Part = 1; Part < UF; ++Part) {
4301       Value *RdxPart = State.get(LoopExitInstDef, Part);
4302       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4303         ReducedPartRdx = Builder.CreateBinOp(
4304             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4305       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
4306         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
4307                                            ReducedPartRdx, RdxPart);
4308       else
4309         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4310     }
4311   }
4312 
4313   // Create the reduction after the loop. Note that inloop reductions create the
4314   // target reduction in the loop using a Reduction recipe.
4315   if (VF.isVector() && !PhiR->isInLoop()) {
4316     ReducedPartRdx =
4317         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4318     // If the reduction can be performed in a smaller type, we need to extend
4319     // the reduction to the wider type before we branch to the original loop.
4320     if (PhiTy != RdxDesc.getRecurrenceType())
4321       ReducedPartRdx = RdxDesc.isSigned()
4322                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4323                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4324   }
4325 
4326   // Create a phi node that merges control-flow from the backedge-taken check
4327   // block and the middle block.
4328   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4329                                         LoopScalarPreHeader->getTerminator());
4330   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4331     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4332   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4333 
4334   // Now, we need to fix the users of the reduction variable
4335   // inside and outside of the scalar remainder loop.
4336 
4337   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4338   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4340   if (!Cost->requiresScalarEpilogue(VF))
4341     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4342       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4343         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4344 
4345   // Fix the scalar loop reduction variable with the incoming reduction sum
4346   // from the vector body and from the backedge value.
4347   int IncomingEdgeBlockIdx =
4348       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4349   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4350   // Pick the other block.
4351   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4352   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4353   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4354 }
4355 
4356 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4357                                                   VPTransformState &State) {
4358   RecurKind RK = RdxDesc.getRecurrenceKind();
4359   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4360     return;
4361 
4362   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4363   assert(LoopExitInstr && "null loop exit instruction");
4364   SmallVector<Instruction *, 8> Worklist;
4365   SmallPtrSet<Instruction *, 8> Visited;
4366   Worklist.push_back(LoopExitInstr);
4367   Visited.insert(LoopExitInstr);
4368 
4369   while (!Worklist.empty()) {
4370     Instruction *Cur = Worklist.pop_back_val();
4371     if (isa<OverflowingBinaryOperator>(Cur))
4372       for (unsigned Part = 0; Part < UF; ++Part) {
4373         // FIXME: Should not rely on getVPValue at this point.
4374         Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4375         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4376       }
4377 
4378     for (User *U : Cur->users()) {
4379       Instruction *UI = cast<Instruction>(U);
4380       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4381           Visited.insert(UI).second)
4382         Worklist.push_back(UI);
4383     }
4384   }
4385 }
4386 
4387 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4388   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4389     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4390       // Some phis were already hand updated by the reduction and recurrence
4391       // code above, leave them alone.
4392       continue;
4393 
4394     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4395     // Non-instruction incoming values will have only one value.
4396 
4397     VPLane Lane = VPLane::getFirstLane();
4398     if (isa<Instruction>(IncomingValue) &&
4399         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4400                                            VF))
4401       Lane = VPLane::getLastLaneForVF(VF);
4402 
4403     // Can be a loop invariant incoming value or the last scalar value to be
4404     // extracted from the vectorized loop.
4405     // FIXME: Should not rely on getVPValue at this point.
4406     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4407     Value *lastIncomingValue =
4408         OrigLoop->isLoopInvariant(IncomingValue)
4409             ? IncomingValue
4410             : State.get(State.Plan->getVPValue(IncomingValue, true),
4411                         VPIteration(UF - 1, Lane));
4412     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4413   }
4414 }
4415 
4416 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4417   // The basic block and loop containing the predicated instruction.
4418   auto *PredBB = PredInst->getParent();
4419   auto *VectorLoop = LI->getLoopFor(PredBB);
4420 
4421   // Initialize a worklist with the operands of the predicated instruction.
4422   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4423 
4424   // Holds instructions that we need to analyze again. An instruction may be
4425   // reanalyzed if we don't yet know if we can sink it or not.
4426   SmallVector<Instruction *, 8> InstsToReanalyze;
4427 
4428   // Returns true if a given use occurs in the predicated block. Phi nodes use
4429   // their operands in their corresponding predecessor blocks.
4430   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4431     auto *I = cast<Instruction>(U.getUser());
4432     BasicBlock *BB = I->getParent();
4433     if (auto *Phi = dyn_cast<PHINode>(I))
4434       BB = Phi->getIncomingBlock(
4435           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4436     return BB == PredBB;
4437   };
4438 
4439   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
4441   // operands are then added to the worklist. The algorithm ends after one pass
4442   // through the worklist doesn't sink a single instruction.
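  // For example, a GEP feeding a predicated store may first fail to sink
  // because another of its users is still outside PredBB; once that user has
  // been sunk, the GEP is reanalyzed and can be sunk as well.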
4443   bool Changed;
4444   do {
4445     // Add the instructions that need to be reanalyzed to the worklist, and
4446     // reset the changed indicator.
4447     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4448     InstsToReanalyze.clear();
4449     Changed = false;
4450 
4451     while (!Worklist.empty()) {
4452       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4453 
4454       // We can't sink an instruction if it is a phi node, is not in the loop,
4455       // or may have side effects.
4456       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4457           I->mayHaveSideEffects())
4458         continue;
4459 
4460       // If the instruction is already in PredBB, check if we can sink its
4461       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4462       // sinking the scalar instruction I, hence it appears in PredBB; but it
4463       // may have failed to sink I's operands (recursively), which we try
4464       // (again) here.
4465       if (I->getParent() == PredBB) {
4466         Worklist.insert(I->op_begin(), I->op_end());
4467         continue;
4468       }
4469 
4470       // It's legal to sink the instruction if all its uses occur in the
4471       // predicated block. Otherwise, there's nothing to do yet, and we may
4472       // need to reanalyze the instruction.
4473       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4474         InstsToReanalyze.push_back(I);
4475         continue;
4476       }
4477 
4478       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4480       I->moveBefore(&*PredBB->getFirstInsertionPt());
4481       Worklist.insert(I->op_begin(), I->op_end());
4482 
4483       // The sinking may have enabled other instructions to be sunk, so we will
4484       // need to iterate.
4485       Changed = true;
4486     }
4487   } while (Changed);
4488 }
4489 
4490 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4491   for (PHINode *OrigPhi : OrigPHIsToFix) {
4492     VPWidenPHIRecipe *VPPhi =
4493         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4494     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4495     // Make sure the builder has a valid insert point.
4496     Builder.SetInsertPoint(NewPhi);
4497     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4498       VPValue *Inc = VPPhi->getIncomingValue(i);
4499       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4500       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4501     }
4502   }
4503 }
4504 
4505 bool InnerLoopVectorizer::useOrderedReductions(
4506     const RecurrenceDescriptor &RdxDesc) {
4507   return Cost->useOrderedReductions(RdxDesc);
4508 }
4509 
4510 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4511                                               VPWidenPHIRecipe *PhiR,
4512                                               VPTransformState &State) {
4513   PHINode *P = cast<PHINode>(PN);
4514   if (EnableVPlanNativePath) {
4515     // Currently we enter here in the VPlan-native path for non-induction
4516     // PHIs where all control flow is uniform. We simply widen these PHIs.
4517     // Create a vector phi with no operands - the vector phi operands will be
4518     // set at the end of vector code generation.
4519     Type *VecTy = (State.VF.isScalar())
4520                       ? PN->getType()
4521                       : VectorType::get(PN->getType(), State.VF);
4522     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4523     State.set(PhiR, VecPhi, 0);
4524     OrigPHIsToFix.push_back(P);
4525 
4526     return;
4527   }
4528 
4529   assert(PN->getParent() == OrigLoop->getHeader() &&
4530          "Non-header phis should have been handled elsewhere");
4531 
4532   // In order to support recurrences we need to be able to vectorize Phi nodes.
4533   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4534   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4535   // this value when we vectorize all of the instructions that use the PHI.
4536 
4537   assert(!Legal->isReductionVariable(P) &&
4538          "reductions should be handled elsewhere");
4539 
4540   setDebugLocFromInst(P);
4541 
4542   // This PHINode must be an induction variable.
4543   // Make sure that we know about it.
4544   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4545 
4546   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4547   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4548 
4549   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4550   // which can be found from the original scalar operations.
4551   switch (II.getKind()) {
4552   case InductionDescriptor::IK_NoInduction:
4553     llvm_unreachable("Unknown induction");
4554   case InductionDescriptor::IK_IntInduction:
4555   case InductionDescriptor::IK_FpInduction:
4556     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4557   case InductionDescriptor::IK_PtrInduction: {
4558     // Handle the pointer induction variable case.
4559     assert(P->getType()->isPointerTy() && "Unexpected type.");
4560 
4561     if (Cost->isScalarAfterVectorization(P, State.VF)) {
4562       // This is the normalized GEP that starts counting at zero.
4563       Value *PtrInd =
4564           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4565       // Determine the number of scalars we need to generate for each unroll
4566       // iteration. If the instruction is uniform, we only need to generate the
4567       // first lane. Otherwise, we generate all VF values.
4568       bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
4569       assert((IsUniform || !State.VF.isScalable()) &&
4570              "Cannot scalarize a scalable VF");
4571       unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
4572 
4573       for (unsigned Part = 0; Part < UF; ++Part) {
4574         Value *PartStart =
4575             createStepForVF(Builder, PtrInd->getType(), VF, Part);
4576 
4577         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4578           Value *Idx = Builder.CreateAdd(
4579               PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4580           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4581           Value *SclrGep =
4582               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4583           SclrGep->setName("next.gep");
4584           State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4585         }
4586       }
4587       return;
4588     }
4589     assert(isa<SCEVConstant>(II.getStep()) &&
4590            "Induction step not a SCEV constant!");
4591     Type *PhiType = II.getStep()->getType();
4592 
4593     // Build a pointer phi
4594     Value *ScalarStartValue = II.getStartValue();
4595     Type *ScStValueType = ScalarStartValue->getType();
4596     PHINode *NewPointerPhi =
4597         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4598     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4599 
    // A pointer induction, performed by using a GEP.
4601     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4602     Instruction *InductionLoc = LoopLatch->getTerminator();
4603     const SCEV *ScalarStep = II.getStep();
4604     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4605     Value *ScalarStepValue =
4606         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4607     Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4608     Value *NumUnrolledElems =
4609         Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4610     Value *InductionGEP = GetElementPtrInst::Create(
4611         II.getElementType(), NewPointerPhi,
4612         Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4613         InductionLoc);
4614     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4615 
    // Create UF actual address GEPs that use the pointer phi as their base
    // and a vectorized version of the step value (<step*0, ..., step*N>) as
    // their offset.
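    // For example, with a fixed VF = 4, UF = 2 and a unit step, this roughly
    // emits (shorthand IR; value names are illustrative only):
    //   %vector.gep   = getelementptr i8, i8* %pointer.phi,
    //                   <4 x i64> <i64 0, i64 1, i64 2, i64 3>
    //   %vector.gep.1 = getelementptr i8, i8* %pointer.phi,
    //                   <4 x i64> <i64 4, i64 5, i64 6, i64 7>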
4619     for (unsigned Part = 0; Part < State.UF; ++Part) {
4620       Type *VecPhiType = VectorType::get(PhiType, State.VF);
4621       Value *StartOffsetScalar =
4622           Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4623       Value *StartOffset =
4624           Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
      // Create a vector of consecutive numbers from zero to VF - 1.
4626       StartOffset =
4627           Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4628 
4629       Value *GEP = Builder.CreateGEP(
4630           II.getElementType(), NewPointerPhi,
4631           Builder.CreateMul(
4632               StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4633               "vector.gep"));
4634       State.set(PhiR, GEP, Part);
4635     }
4636   }
4637   }
4638 }
4639 
4640 /// A helper function for checking whether an integer division-related
4641 /// instruction may divide by zero (in which case it must be predicated if
4642 /// executed conditionally in the scalar code).
4643 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so without predication.
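/// For example, 'udiv i32 %x, 7' can never divide by zero and need not be
/// predicated, whereas 'udiv i32 %x, %y' may divide by zero and must be
/// predicated if executed conditionally in the scalar code.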
4647 static bool mayDivideByZero(Instruction &I) {
4648   assert((I.getOpcode() == Instruction::UDiv ||
4649           I.getOpcode() == Instruction::SDiv ||
4650           I.getOpcode() == Instruction::URem ||
4651           I.getOpcode() == Instruction::SRem) &&
4652          "Unexpected instruction");
4653   Value *Divisor = I.getOperand(1);
4654   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4655   return !CInt || CInt->isZero();
4656 }
4657 
4658 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4659                                                VPUser &ArgOperands,
4660                                                VPTransformState &State) {
4661   assert(!isa<DbgInfoIntrinsic>(I) &&
4662          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4663   setDebugLocFromInst(&I);
4664 
4665   Module *M = I.getParent()->getParent()->getParent();
4666   auto *CI = cast<CallInst>(&I);
4667 
4668   SmallVector<Type *, 4> Tys;
4669   for (Value *ArgOperand : CI->args())
4670     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4671 
4672   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4673 
  // This flag shows whether we use an intrinsic or a regular call for the
  // vectorized version of the instruction, i.e., whether it is more
  // beneficial to emit an intrinsic call than a library call.
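  // For example, for a call to llvm.sqrt.f32 at VF = 4 this compares the cost
  // of the llvm.sqrt.v4f32 intrinsic against the cost of a vector library
  // call (if one is available) and picks whichever is cheaper.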
4677   bool NeedToScalarize = false;
4678   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4679   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4680   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4681   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4682          "Instruction should be scalarized elsewhere.");
4683   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4684          "Either the intrinsic cost or vector call cost must be valid");
4685 
4686   for (unsigned Part = 0; Part < UF; ++Part) {
4687     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4688     SmallVector<Value *, 4> Args;
4689     for (auto &I : enumerate(ArgOperands.operands())) {
4690       // Some intrinsics have a scalar argument - don't replace it with a
4691       // vector.
4692       Value *Arg;
4693       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4694         Arg = State.get(I.value(), Part);
4695       else {
4696         Arg = State.get(I.value(), VPIteration(0, 0));
4697         if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
4698           TysForDecl.push_back(Arg->getType());
4699       }
4700       Args.push_back(Arg);
4701     }
4702 
4703     Function *VectorF;
4704     if (UseVectorIntrinsic) {
4705       // Use vector version of the intrinsic.
4706       if (VF.isVector())
4707         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4708       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4709       assert(VectorF && "Can't retrieve vector intrinsic.");
4710     } else {
4711       // Use vector version of the function call.
4712       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4713 #ifndef NDEBUG
4714       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4715              "Can't create vector function.");
4716 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    addMetadata(V, &I);
4728   }
4729 }
4730 
4731 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4732   // We should not collect Scalars more than once per VF. Right now, this
4733   // function is called from collectUniformsAndScalars(), which already does
4734   // this check. Collecting Scalars for VF=1 does not make any sense.
4735   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4736          "This function should not be visited twice for the same VF");
4737 
4738   SmallSetVector<Instruction *, 8> Worklist;
4739 
4740   // These sets are used to seed the analysis with pointers used by memory
4741   // accesses that will remain scalar.
4742   SmallSetVector<Instruction *, 8> ScalarPtrs;
4743   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4744   auto *Latch = TheLoop->getLoopLatch();
4745 
4746   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4747   // The pointer operands of loads and stores will be scalar as long as the
4748   // memory access is not a gather or scatter operation. The value operand of a
4749   // store will remain scalar if the store is scalarized.
4750   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4751     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4752     assert(WideningDecision != CM_Unknown &&
4753            "Widening decision should be ready at this moment");
4754     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4755       if (Ptr == Store->getValueOperand())
4756         return WideningDecision == CM_Scalarize;
4757     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4758            "Ptr is neither a value or pointer operand");
4759     return WideningDecision != CM_GatherScatter;
4760   };
4761 
4762   // A helper that returns true if the given value is a bitcast or
4763   // getelementptr instruction contained in the loop.
4764   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4765     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4766             isa<GetElementPtrInst>(V)) &&
4767            !TheLoop->isLoopInvariant(V);
4768   };
4769 
4770   // A helper that evaluates a memory access's use of a pointer. If the use will
4771   // be a scalar use and the pointer is only used by memory accesses, we place
4772   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4773   // PossibleNonScalarPtrs.
4774   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4775     // We only care about bitcast and getelementptr instructions contained in
4776     // the loop.
4777     if (!isLoopVaryingBitCastOrGEP(Ptr))
4778       return;
4779 
4780     // If the pointer has already been identified as scalar (e.g., if it was
4781     // also identified as uniform), there's nothing to do.
4782     auto *I = cast<Instruction>(Ptr);
4783     if (Worklist.count(I))
4784       return;
4785 
4786     // If the use of the pointer will be a scalar use, and all users of the
4787     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4788     // place the pointer in PossibleNonScalarPtrs.
4789     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4790           return isa<LoadInst>(U) || isa<StoreInst>(U);
4791         }))
4792       ScalarPtrs.insert(I);
4793     else
4794       PossibleNonScalarPtrs.insert(I);
4795   };
4796 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
4801   //
4802   // (1) Add to the worklist all instructions that have been identified as
4803   // uniform-after-vectorization.
4804   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4805 
  // (2) Add to the worklist all bitcast and getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
  // scatter operation. The value operand of a store will remain scalar if the
  // store is scalarized.
4811   for (auto *BB : TheLoop->blocks())
4812     for (auto &I : *BB) {
4813       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4814         evaluatePtrUse(Load, Load->getPointerOperand());
4815       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4816         evaluatePtrUse(Store, Store->getPointerOperand());
4817         evaluatePtrUse(Store, Store->getValueOperand());
4818       }
4819     }
4820   for (auto *I : ScalarPtrs)
4821     if (!PossibleNonScalarPtrs.count(I)) {
4822       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4823       Worklist.insert(I);
4824     }
4825 
4826   // Insert the forced scalars.
4827   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4828   // induction variable when the PHI user is scalarized.
4829   auto ForcedScalar = ForcedScalars.find(VF);
4830   if (ForcedScalar != ForcedScalars.end())
4831     for (auto *I : ForcedScalar->second)
4832       Worklist.insert(I);
4833 
4834   // Expand the worklist by looking through any bitcasts and getelementptr
4835   // instructions we've already identified as scalar. This is similar to the
4836   // expansion step in collectLoopUniforms(); however, here we're only
4837   // expanding to include additional bitcasts and getelementptr instructions.
4838   unsigned Idx = 0;
4839   while (Idx != Worklist.size()) {
4840     Instruction *Dst = Worklist[Idx++];
4841     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4842       continue;
4843     auto *Src = cast<Instruction>(Dst->getOperand(0));
4844     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4845           auto *J = cast<Instruction>(U);
4846           return !TheLoop->contains(J) || Worklist.count(J) ||
4847                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4848                   isScalarUse(J, Src));
4849         })) {
4850       Worklist.insert(Src);
4851       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4852     }
4853   }
4854 
4855   // An induction variable will remain scalar if all users of the induction
4856   // variable and induction variable update remain scalar.
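  // For example (illustrative), in a loop equivalent to
  //   for (i = 0; i < n; i += 1)
  //     a[i] = b[i] + 1;
  // the users of 'i' are its own update and address computations that are
  // already known to be scalar, so both 'i' and 'i + 1' remain scalar. If 'i'
  // were additionally stored as a value by a widened store, the induction
  // would need a vector version and would not be added here.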
4857   for (auto &Induction : Legal->getInductionVars()) {
4858     auto *Ind = Induction.first;
4859     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4860 
4861     // If tail-folding is applied, the primary induction variable will be used
4862     // to feed a vector compare.
4863     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4864       continue;
4865 
4866     // Returns true if \p Indvar is a pointer induction that is used directly by
4867     // load/store instruction \p I.
4868     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4869                                               Instruction *I) {
4870       return Induction.second.getKind() ==
4871                  InductionDescriptor::IK_PtrInduction &&
4872              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4873              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4874     };
4875 
4876     // Determine if all users of the induction variable are scalar after
4877     // vectorization.
4878     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4879       auto *I = cast<Instruction>(U);
4880       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4881              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4882     });
4883     if (!ScalarInd)
4884       continue;
4885 
4886     // Determine if all users of the induction variable update instruction are
4887     // scalar after vectorization.
4888     auto ScalarIndUpdate =
4889         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4890           auto *I = cast<Instruction>(U);
4891           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4892                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4893         });
4894     if (!ScalarIndUpdate)
4895       continue;
4896 
4897     // The induction variable and its update instruction will remain scalar.
4898     Worklist.insert(Ind);
4899     Worklist.insert(IndUpdate);
4900     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4901     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4902                       << "\n");
4903   }
4904 
4905   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4906 }
4907 
4908 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
4909   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4910     return false;
4911   switch(I->getOpcode()) {
4912   default:
4913     break;
4914   case Instruction::Load:
4915   case Instruction::Store: {
4916     if (!Legal->isMaskRequired(I))
4917       return false;
4918     auto *Ptr = getLoadStorePointerOperand(I);
4919     auto *Ty = getLoadStoreType(I);
4920     const Align Alignment = getLoadStoreAlignment(I);
4921     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4922                                 TTI.isLegalMaskedGather(Ty, Alignment))
4923                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4924                                 TTI.isLegalMaskedScatter(Ty, Alignment));
4925   }
4926   case Instruction::UDiv:
4927   case Instruction::SDiv:
4928   case Instruction::SRem:
4929   case Instruction::URem:
4930     return mayDivideByZero(*I);
4931   }
4932   return false;
4933 }
4934 
4935 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4936     Instruction *I, ElementCount VF) {
4937   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4938   assert(getWideningDecision(I, VF) == CM_Unknown &&
4939          "Decision should not be set yet.");
4940   auto *Group = getInterleavedAccessGroup(I);
4941   assert(Group && "Must have a group.");
4942 
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
4945   auto &DL = I->getModule()->getDataLayout();
4946   auto *ScalarTy = getLoadStoreType(I);
4947   if (hasIrregularType(ScalarTy, DL))
4948     return false;
4949 
4950   // Check if masking is required.
4951   // A Group may need masking for one of two reasons: it resides in a block that
4952   // needs predication, or it was decided to use masking to deal with gaps
4953   // (either a gap at the end of a load-access that may result in a speculative
4954   // load, or any gaps in a store-access).
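  // For example (illustrative), a load group with factor 3 accessing only
  // members 0 and 1 (A[3*i] and A[3*i+1]) has a gap at the end; its last wide
  // load may speculatively read past the end of the underlying object unless
  // the group is masked or a scalar epilogue is guaranteed.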
4955   bool PredicatedAccessRequiresMasking =
4956       blockNeedsPredicationForAnyReason(I->getParent()) &&
4957       Legal->isMaskRequired(I);
4958   bool LoadAccessWithGapsRequiresEpilogMasking =
4959       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4960       !isScalarEpilogueAllowed();
4961   bool StoreAccessWithGapsRequiresMasking =
4962       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4963   if (!PredicatedAccessRequiresMasking &&
4964       !LoadAccessWithGapsRequiresEpilogMasking &&
4965       !StoreAccessWithGapsRequiresMasking)
4966     return true;
4967 
4968   // If masked interleaving is required, we expect that the user/target had
4969   // enabled it, because otherwise it either wouldn't have been created or
4970   // it should have been invalidated by the CostModel.
4971   assert(useMaskedInterleavedAccesses(TTI) &&
4972          "Masked interleave-groups for predicated accesses are not enabled.");
4973 
4974   if (Group->isReverse())
4975     return false;
4976 
4977   auto *Ty = getLoadStoreType(I);
4978   const Align Alignment = getLoadStoreAlignment(I);
4979   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4980                           : TTI.isLegalMaskedStore(Ty, Alignment);
4981 }
4982 
4983 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4984     Instruction *I, ElementCount VF) {
4985   // Get and ensure we have a valid memory instruction.
4986   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4987 
4988   auto *Ptr = getLoadStorePointerOperand(I);
4989   auto *ScalarTy = getLoadStoreType(I);
4990 
4991   // In order to be widened, the pointer should be consecutive, first of all.
4992   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4993     return false;
4994 
4995   // If the instruction is a store located in a predicated block, it will be
4996   // scalarized.
4997   if (isScalarWithPredication(I))
4998     return false;
4999 
  // If the instruction's allocated size doesn't equal its type size, it
  // requires padding and will be scalarized.
5002   auto &DL = I->getModule()->getDataLayout();
5003   if (hasIrregularType(ScalarTy, DL))
5004     return false;
5005 
5006   return true;
5007 }
5008 
5009 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5010   // We should not collect Uniforms more than once per VF. Right now,
5011   // this function is called from collectUniformsAndScalars(), which
5012   // already does this check. Collecting Uniforms for VF=1 does not make any
5013   // sense.
5014 
5015   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5016          "This function should not be visited twice for the same VF");
5017 
  // Create the entry for this VF. Even if no uniform values are found, we
  // will not analyze the loop again: Uniforms.count(VF) will return 1.
5020   Uniforms[VF].clear();
5021 
5022   // We now know that the loop is vectorizable!
5023   // Collect instructions inside the loop that will remain uniform after
5024   // vectorization.
5025 
5026   // Global values, params and instructions outside of current loop are out of
5027   // scope.
5028   auto isOutOfScope = [&](Value *V) -> bool {
5029     Instruction *I = dyn_cast<Instruction>(V);
5030     return (!I || !TheLoop->contains(I));
5031   };
5032 
5033   // Worklist containing uniform instructions demanding lane 0.
5034   SetVector<Instruction *> Worklist;
5035   BasicBlock *Latch = TheLoop->getLoopLatch();
5036 
5037   // Add uniform instructions demanding lane 0 to the worklist. Instructions
5038   // that are scalar with predication must not be considered uniform after
5039   // vectorization, because that would create an erroneous replicating region
5040   // where only a single instance out of VF should be formed.
  // TODO: optimize such rare cases if they turn out to be important, see
  // PR40816.
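  // For example (illustrative), a udiv executed conditionally in the loop
  // body is scalar-with-predication; even if it only demands lane 0, treating
  // it as uniform would form a replicating region that produces VF copies
  // where a single instance is required.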
5042   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5043     if (isOutOfScope(I)) {
5044       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5045                         << *I << "\n");
5046       return;
5047     }
5048     if (isScalarWithPredication(I)) {
5049       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5050                         << *I << "\n");
5051       return;
5052     }
5053     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5054     Worklist.insert(I);
5055   };
5056 
5057   // Start with the conditional branch. If the branch condition is an
5058   // instruction contained in the loop that is only used by the branch, it is
5059   // uniform.
5060   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5061   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5062     addToWorklistIfAllowed(Cmp);
5063 
5064   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5065     InstWidening WideningDecision = getWideningDecision(I, VF);
5066     assert(WideningDecision != CM_Unknown &&
5067            "Widening decision should be ready at this moment");
5068 
5069     // A uniform memory op is itself uniform.  We exclude uniform stores
5070     // here as they demand the last lane, not the first one.
5071     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5072       assert(WideningDecision == CM_Scalarize);
5073       return true;
5074     }
5075 
5076     return (WideningDecision == CM_Widen ||
5077             WideningDecision == CM_Widen_Reverse ||
5078             WideningDecision == CM_Interleave);
5079   };
5080 
5081 
5082   // Returns true if Ptr is the pointer operand of a memory access instruction
5083   // I, and I is known to not require scalarization.
5084   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5085     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5086   };
5087 
5088   // Holds a list of values which are known to have at least one uniform use.
5089   // Note that there may be other uses which aren't uniform.  A "uniform use"
5090   // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (i.e. this is not
  // the usual meaning of uniform).
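  // For example (illustrative), the address feeding a consecutive widened
  // load has a uniform use: the single wide load is built from the lane-0
  // pointer, so the remaining lanes of that address are never demanded by the
  // memory access itself.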
5093   SetVector<Value *> HasUniformUse;
5094 
5095   // Scan the loop for instructions which are either a) known to have only
5096   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5097   for (auto *BB : TheLoop->blocks())
5098     for (auto &I : *BB) {
5099       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
5100         switch (II->getIntrinsicID()) {
5101         case Intrinsic::sideeffect:
5102         case Intrinsic::experimental_noalias_scope_decl:
5103         case Intrinsic::assume:
5104         case Intrinsic::lifetime_start:
5105         case Intrinsic::lifetime_end:
5106           if (TheLoop->hasLoopInvariantOperands(&I))
5107             addToWorklistIfAllowed(&I);
5108           break;
5109         default:
5110           break;
5111         }
5112       }
5113 
5114       // ExtractValue instructions must be uniform, because the operands are
5115       // known to be loop-invariant.
5116       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
5117         assert(isOutOfScope(EVI->getAggregateOperand()) &&
5118                "Expected aggregate value to be loop invariant");
5119         addToWorklistIfAllowed(EVI);
5120         continue;
5121       }
5122 
5123       // If there's no pointer operand, there's nothing to do.
5124       auto *Ptr = getLoadStorePointerOperand(&I);
5125       if (!Ptr)
5126         continue;
5127 
5128       // A uniform memory op is itself uniform.  We exclude uniform stores
5129       // here as they demand the last lane, not the first one.
5130       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5131         addToWorklistIfAllowed(&I);
5132 
5133       if (isUniformDecision(&I, VF)) {
5134         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5135         HasUniformUse.insert(Ptr);
5136       }
5137     }
5138 
  // Add to the worklist any operands which have *only* uniform (i.e. lane-0
  // demanding) users.  Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
5142   for (auto *V : HasUniformUse) {
5143     if (isOutOfScope(V))
5144       continue;
5145     auto *I = cast<Instruction>(V);
5146     auto UsersAreMemAccesses =
5147       llvm::all_of(I->users(), [&](User *U) -> bool {
5148         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5149       });
5150     if (UsersAreMemAccesses)
5151       addToWorklistIfAllowed(I);
5152   }
5153 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // a uniform instruction will only be used by uniform instructions.
5157   unsigned idx = 0;
5158   while (idx != Worklist.size()) {
5159     Instruction *I = Worklist[idx++];
5160 
5161     for (auto OV : I->operand_values()) {
5162       // isOutOfScope operands cannot be uniform instructions.
5163       if (isOutOfScope(OV))
5164         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
5167       auto *OP = dyn_cast<PHINode>(OV);
5168       if (OP && Legal->isFirstOrderRecurrence(OP))
5169         continue;
5170       // If all the users of the operand are uniform, then add the
5171       // operand into the uniform worklist.
5172       auto *OI = cast<Instruction>(OV);
5173       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5174             auto *J = cast<Instruction>(U);
5175             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5176           }))
5177         addToWorklistIfAllowed(OI);
5178     }
5179   }
5180 
5181   // For an instruction to be added into Worklist above, all its users inside
5182   // the loop should also be in Worklist. However, this condition cannot be
5183   // true for phi nodes that form a cyclic dependence. We must process phi
5184   // nodes separately. An induction variable will remain uniform if all users
5185   // of the induction variable and induction variable update remain uniform.
5186   // The code below handles both pointer and non-pointer induction variables.
5187   for (auto &Induction : Legal->getInductionVars()) {
5188     auto *Ind = Induction.first;
5189     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5190 
5191     // Determine if all users of the induction variable are uniform after
5192     // vectorization.
5193     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5194       auto *I = cast<Instruction>(U);
5195       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5196              isVectorizedMemAccessUse(I, Ind);
5197     });
5198     if (!UniformInd)
5199       continue;
5200 
5201     // Determine if all users of the induction variable update instruction are
5202     // uniform after vectorization.
5203     auto UniformIndUpdate =
5204         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5205           auto *I = cast<Instruction>(U);
5206           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5207                  isVectorizedMemAccessUse(I, IndUpdate);
5208         });
5209     if (!UniformIndUpdate)
5210       continue;
5211 
5212     // The induction variable and its update instruction will remain uniform.
5213     addToWorklistIfAllowed(Ind);
5214     addToWorklistIfAllowed(IndUpdate);
5215   }
5216 
5217   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5218 }
5219 
5220 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5221   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5222 
5223   if (Legal->getRuntimePointerChecking()->Need) {
5224     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5225         "runtime pointer checks needed. Enable vectorization of this "
5226         "loop with '#pragma clang loop vectorize(enable)' when "
5227         "compiling with -Os/-Oz",
5228         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5229     return true;
5230   }
5231 
5232   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5233     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5234         "runtime SCEV checks needed. Enable vectorization of this "
5235         "loop with '#pragma clang loop vectorize(enable)' when "
5236         "compiling with -Os/-Oz",
5237         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5238     return true;
5239   }
5240 
5241   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5242   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5243     reportVectorizationFailure("Runtime stride check for small trip count",
5244         "runtime stride == 1 checks needed. Enable vectorization of "
5245         "this loop without such check by compiling with -Os/-Oz",
5246         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5247     return true;
5248   }
5249 
5250   return false;
5251 }
5252 
5253 ElementCount
5254 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
5255   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
5256     return ElementCount::getScalable(0);
5257 
5258   if (Hints->isScalableVectorizationDisabled()) {
5259     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
5260                             "ScalableVectorizationDisabled", ORE, TheLoop);
5261     return ElementCount::getScalable(0);
5262   }
5263 
5264   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
5265 
5266   auto MaxScalableVF = ElementCount::getScalable(
5267       std::numeric_limits<ElementCount::ScalarTy>::max());
5268 
5269   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
5270   // FIXME: While for scalable vectors this is currently sufficient, this should
5271   // be replaced by a more detailed mechanism that filters out specific VFs,
5272   // instead of invalidating vectorization for a whole set of VFs based on the
5273   // MaxVF.
5274 
5275   // Disable scalable vectorization if the loop contains unsupported reductions.
5276   if (!canVectorizeReductions(MaxScalableVF)) {
5277     reportVectorizationInfo(
5278         "Scalable vectorization not supported for the reduction "
5279         "operations found in this loop.",
5280         "ScalableVFUnfeasible", ORE, TheLoop);
5281     return ElementCount::getScalable(0);
5282   }
5283 
5284   // Disable scalable vectorization if the loop contains any instructions
5285   // with element types not supported for scalable vectors.
5286   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
5287         return !Ty->isVoidTy() &&
5288                !this->TTI.isElementTypeLegalForScalableVector(Ty);
5289       })) {
5290     reportVectorizationInfo("Scalable vectorization is not supported "
5291                             "for all element types found in this loop.",
5292                             "ScalableVFUnfeasible", ORE, TheLoop);
5293     return ElementCount::getScalable(0);
5294   }
5295 
5296   if (Legal->isSafeForAnyVectorWidth())
5297     return MaxScalableVF;
5298 
5299   // Limit MaxScalableVF by the maximum safe dependence distance.
5300   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5301   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
5302     MaxVScale =
5303         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
5304   MaxScalableVF = ElementCount::getScalable(
5305       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5306   if (!MaxScalableVF)
5307     reportVectorizationInfo(
5308         "Max legal vector width too small, scalable vectorization "
5309         "unfeasible.",
5310         "ScalableVFUnfeasible", ORE, TheLoop);
5311 
5312   return MaxScalableVF;
5313 }
5314 
5315 FixedScalableVFPair
5316 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5317                                                  ElementCount UserVF) {
5318   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5319   unsigned SmallestType, WidestType;
5320   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5321 
5322   // Get the maximum safe dependence distance in bits computed by LAA.
5323   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5324   // the memory accesses that is most restrictive (involved in the smallest
5325   // dependence distance).
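  // For example (illustrative), if LAA reports a maximum safe vector width of
  // 256 bits and the widest type in the loop is 32 bits wide, then at most
  // 256 / 32 = 8 elements can safely be processed per vector iteration.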
5326   unsigned MaxSafeElements =
5327       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
5328 
5329   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
5330   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
5331 
5332   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5333                     << ".\n");
5334   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5335                     << ".\n");
5336 
5337   // First analyze the UserVF, fall back if the UserVF should be ignored.
5338   if (UserVF) {
5339     auto MaxSafeUserVF =
5340         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
5341 
5342     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
5343       // If `VF=vscale x N` is safe, then so is `VF=N`
5344       if (UserVF.isScalable())
5345         return FixedScalableVFPair(
5346             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
5347       else
5348         return UserVF;
5349     }
5350 
5351     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5352 
5353     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
5354     // is better to ignore the hint and let the compiler choose a suitable VF.
5355     if (!UserVF.isScalable()) {
5356       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5357                         << " is unsafe, clamping to max safe VF="
5358                         << MaxSafeFixedVF << ".\n");
5359       ORE->emit([&]() {
5360         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5361                                           TheLoop->getStartLoc(),
5362                                           TheLoop->getHeader())
5363                << "User-specified vectorization factor "
5364                << ore::NV("UserVectorizationFactor", UserVF)
5365                << " is unsafe, clamping to maximum safe vectorization factor "
5366                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
5367       });
5368       return MaxSafeFixedVF;
5369     }
5370 
5371     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
5372       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5373                         << " is ignored because scalable vectors are not "
5374                            "available.\n");
5375       ORE->emit([&]() {
5376         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5377                                           TheLoop->getStartLoc(),
5378                                           TheLoop->getHeader())
5379                << "User-specified vectorization factor "
5380                << ore::NV("UserVectorizationFactor", UserVF)
5381                << " is ignored because the target does not support scalable "
5382                   "vectors. The compiler will pick a more suitable value.";
5383       });
5384     } else {
5385       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5386                         << " is unsafe. Ignoring scalable UserVF.\n");
5387       ORE->emit([&]() {
5388         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5389                                           TheLoop->getStartLoc(),
5390                                           TheLoop->getHeader())
5391                << "User-specified vectorization factor "
5392                << ore::NV("UserVectorizationFactor", UserVF)
5393                << " is unsafe. Ignoring the hint to let the compiler pick a "
5394                   "more suitable value.";
5395       });
5396     }
5397   }
5398 
5399   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5400                     << " / " << WidestType << " bits.\n");
5401 
5402   FixedScalableVFPair Result(ElementCount::getFixed(1),
5403                              ElementCount::getScalable(0));
5404   if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
5405                                            WidestType, MaxSafeFixedVF))
5406     Result.FixedVF = MaxVF;
5407 
5408   if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
5409                                            WidestType, MaxSafeScalableVF))
5410     if (MaxVF.isScalable()) {
5411       Result.ScalableVF = MaxVF;
5412       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5413                         << "\n");
5414     }
5415 
5416   return Result;
5417 }
5418 
5419 FixedScalableVFPair
5420 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5421   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to be
    // dynamically uniform if the target can skip it.
5424     reportVectorizationFailure(
5425         "Not inserting runtime ptr check for divergent target",
5426         "runtime pointer checks needed. Not enabled for divergent target",
5427         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5428     return FixedScalableVFPair::getNone();
5429   }
5430 
5431   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5432   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5433   if (TC == 1) {
5434     reportVectorizationFailure("Single iteration (non) loop",
5435         "loop trip count is one, irrelevant for vectorization",
5436         "SingleIterationLoop", ORE, TheLoop);
5437     return FixedScalableVFPair::getNone();
5438   }
5439 
5440   switch (ScalarEpilogueStatus) {
5441   case CM_ScalarEpilogueAllowed:
5442     return computeFeasibleMaxVF(TC, UserVF);
5443   case CM_ScalarEpilogueNotAllowedUsePredicate:
5444     LLVM_FALLTHROUGH;
5445   case CM_ScalarEpilogueNotNeededUsePredicate:
5446     LLVM_DEBUG(
5447         dbgs() << "LV: vector predicate hint/switch found.\n"
5448                << "LV: Not allowing scalar epilogue, creating predicated "
5449                << "vector loop.\n");
5450     break;
5451   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5452     // fallthrough as a special case of OptForSize
5453   case CM_ScalarEpilogueNotAllowedOptSize:
5454     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5455       LLVM_DEBUG(
5456           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5457     else
5458       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5459                         << "count.\n");
5460 
5461     // Bail if runtime checks are required, which are not good when optimising
5462     // for size.
5463     if (runtimeChecksRequired())
5464       return FixedScalableVFPair::getNone();
5465 
5466     break;
5467   }
5468 
5469   // The only loops we can vectorize without a scalar epilogue, are loops with
5470   // a bottom-test and a single exiting block. We'd have to handle the fact
5471   // that not every instruction executes on the last iteration.  This will
5472   // require a lane mask which varies through the vector loop body.  (TODO)
5473   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5474     // If there was a tail-folding hint/switch, but we can't fold the tail by
5475     // masking, fallback to a vectorization with a scalar epilogue.
5476     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5477       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5478                            "scalar epilogue instead.\n");
5479       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5480       return computeFeasibleMaxVF(TC, UserVF);
5481     }
5482     return FixedScalableVFPair::getNone();
5483   }
5484 
5485   // Now try the tail folding
5486 
5487   // Invalidate interleave groups that require an epilogue if we can't mask
5488   // the interleave-group.
5489   if (!useMaskedInterleavedAccesses(TTI)) {
5490     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5491            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5494     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5495   }
5496 
5497   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
5498   // Avoid tail folding if the trip count is known to be a multiple of any VF
5499   // we chose.
5500   // FIXME: The condition below pessimises the case for fixed-width vectors,
5501   // when scalable VFs are also candidates for vectorization.
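  // For example (illustrative), with a known trip count of 64, MaxFixedVF = 8
  // and no user-specified interleave count, 64 is divisible by 8 (and by
  // every smaller power-of-2 VF), so no tail remains and tail folding can be
  // skipped.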
5502   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5503     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5504     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5505            "MaxFixedVF must be a power of 2");
5506     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5507                                    : MaxFixedVF.getFixedValue();
5508     ScalarEvolution *SE = PSE.getSE();
5509     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5510     const SCEV *ExitCount = SE->getAddExpr(
5511         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5512     const SCEV *Rem = SE->getURemExpr(
5513         SE->applyLoopGuards(ExitCount, TheLoop),
5514         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5515     if (Rem->isZero()) {
5516       // Accept MaxFixedVF if we do not have a tail.
5517       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5518       return MaxFactors;
5519     }
5520   }
5521 
  // For scalable vectors, don't use tail folding as this is not yet
  // supported. The code is likely to have ended up here if the trip count is
  // low, in which case it makes sense not to use scalable vectors.
5525   if (MaxFactors.ScalableVF.isVector())
5526     MaxFactors.ScalableVF = ElementCount::getScalable(0);
5527 
5528   // If we don't know the precise trip count, or if the trip count that we
5529   // found modulo the vectorization factor is not zero, try to fold the tail
5530   // by masking.
5531   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5532   if (Legal->prepareToFoldTailByMasking()) {
5533     FoldTailByMasking = true;
5534     return MaxFactors;
5535   }
5536 
5537   // If there was a tail-folding hint/switch, but we can't fold the tail by
5538   // masking, fallback to a vectorization with a scalar epilogue.
5539   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5540     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5541                          "scalar epilogue instead.\n");
5542     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5543     return MaxFactors;
5544   }
5545 
5546   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5547     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5548     return FixedScalableVFPair::getNone();
5549   }
5550 
5551   if (TC == 0) {
5552     reportVectorizationFailure(
5553         "Unable to calculate the loop count due to complex control flow",
5554         "unable to calculate the loop count due to complex control flow",
5555         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5556     return FixedScalableVFPair::getNone();
5557   }
5558 
5559   reportVectorizationFailure(
5560       "Cannot optimize for size and vectorize at the same time.",
5561       "cannot optimize for size and vectorize at the same time. "
5562       "Enable vectorization of this loop with '#pragma clang loop "
5563       "vectorize(enable)' when compiling with -Os/-Oz",
5564       "NoTailLoopWithOptForSize", ORE, TheLoop);
5565   return FixedScalableVFPair::getNone();
5566 }
5567 
5568 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5569     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5570     const ElementCount &MaxSafeVF) {
5571   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5572   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5573       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5574                            : TargetTransformInfo::RGK_FixedWidthVector);
5575 
5576   // Convenience function to return the minimum of two ElementCounts.
5577   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5578     assert((LHS.isScalable() == RHS.isScalable()) &&
5579            "Scalable flags must match");
5580     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5581   };
5582 
  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
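  // For example (illustrative), with 256-bit fixed-width vector registers and
  // a widest element type of 32 bits, MaxVectorElementCount starts out as a
  // fixed count of 8 lanes before being clamped to MaxSafeVF.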
5585   auto MaxVectorElementCount = ElementCount::get(
5586       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5587       ComputeScalableMaxVF);
5588   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5589   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5590                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5591 
5592   if (!MaxVectorElementCount) {
5593     LLVM_DEBUG(dbgs() << "LV: The target has no "
5594                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5595                       << " vector registers.\n");
5596     return ElementCount::getFixed(1);
5597   }
5598 
5599   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5600   if (ConstTripCount &&
5601       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5602       isPowerOf2_32(ConstTripCount)) {
5603     // We need to clamp the VF to be the ConstTripCount. There is no point in
5604     // choosing a higher viable VF as done in the loop below. If
5605     // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
5606     // the TC is less than or equal to the known number of lanes.
5607     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5608                       << ConstTripCount << "\n");
5609     return TripCountEC;
5610   }
5611 
5612   ElementCount MaxVF = MaxVectorElementCount;
5613   if (TTI.shouldMaximizeVectorBandwidth() ||
5614       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5615     auto MaxVectorElementCountMaxBW = ElementCount::get(
5616         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5617         ComputeScalableMaxVF);
5618     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5619 
5620     // Collect all viable vectorization factors larger than the default MaxVF
5621     // (i.e. MaxVectorElementCount).
5622     SmallVector<ElementCount, 8> VFs;
5623     for (ElementCount VS = MaxVectorElementCount * 2;
5624          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5625       VFs.push_back(VS);
5626 
5627     // For each VF calculate its register usage.
5628     auto RUs = calculateRegisterUsage(VFs);
5629 
5630     // Select the largest VF which doesn't require more registers than existing
5631     // ones.
5632     for (int i = RUs.size() - 1; i >= 0; --i) {
5633       bool Selected = true;
5634       for (auto &pair : RUs[i].MaxLocalUsers) {
5635         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5636         if (pair.second > TargetNumRegisters)
5637           Selected = false;
5638       }
5639       if (Selected) {
5640         MaxVF = VFs[i];
5641         break;
5642       }
5643     }
5644     if (ElementCount MinVF =
5645             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5646       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5647         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5648                           << ") with target's minimum: " << MinVF << '\n');
5649         MaxVF = MinVF;
5650       }
5651     }
5652   }
5653   return MaxVF;
5654 }
5655 
5656 bool LoopVectorizationCostModel::isMoreProfitable(
5657     const VectorizationFactor &A, const VectorizationFactor &B) const {
5658   InstructionCost CostA = A.Cost;
5659   InstructionCost CostB = B.Cost;
5660 
5661   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5662 
5663   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5664       MaxTripCount) {
5665     // If we are folding the tail and the trip count is a known (possibly small)
5666     // constant, the trip count will be rounded up to an integer number of
5667     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5668     // which we compare directly. When not folding the tail, the total cost will
5669     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5670     // approximated with the per-lane cost below instead of using the tripcount
5671     // as here.
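    // For example (illustrative), with MaxTripCount = 5, a VF=4 plan costing
    // 10 per iteration needs ceil(5/4) = 2 iterations (total cost 20), while
    // a VF=8 plan costing 18 per iteration needs ceil(5/8) = 1 iteration
    // (total cost 18), so the wider plan is preferred despite its higher
    // per-iteration cost.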
5672     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5673     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5674     return RTCostA < RTCostB;
5675   }
5676 
5677   // Improve estimate for the vector width if it is scalable.
5678   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5679   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5680   if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
5681     if (A.Width.isScalable())
5682       EstimatedWidthA *= VScale.getValue();
5683     if (B.Width.isScalable())
5684       EstimatedWidthB *= VScale.getValue();
5685   }
5686 
5687   // When set to preferred, for now assume vscale may be larger than 1 (or the
5688   // one being tuned for), so that scalable vectorization is slightly favorable
5689   // over fixed-width vectorization.
5690   if (Hints->isScalableVectorizationPreferred())
5691     if (A.Width.isScalable() && !B.Width.isScalable())
5692       return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5693 
5694   // To avoid the need for FP division:
5695   //      (CostA / A.Width) < (CostB / B.Width)
5696   // <=>  (CostA * B.Width) < (CostB * A.Width)
5697   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5698 }
5699 
5700 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5701     const ElementCountSet &VFCandidates) {
5702   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5703   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5704   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5705   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5706          "Expected Scalar VF to be a candidate");
5707 
5708   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
5709   VectorizationFactor ChosenFactor = ScalarCost;
5710 
5711   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5712   if (ForceVectorization && VFCandidates.size() > 1) {
5713     // Ignore scalar width, because the user explicitly wants vectorization.
5714     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5715     // evaluation.
5716     ChosenFactor.Cost = InstructionCost::getMax();
5717   }
5718 
5719   SmallVector<InstructionVFPair> InvalidCosts;
5720   for (const auto &i : VFCandidates) {
5721     // The cost for scalar VF=1 is already calculated, so ignore it.
5722     if (i.isScalar())
5723       continue;
5724 
5725     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5726     VectorizationFactor Candidate(i, C.first);
5727 
5728 #ifndef NDEBUG
5729     unsigned AssumedMinimumVscale = 1;
5730     if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
5731       AssumedMinimumVscale = VScale.getValue();
5732     unsigned Width =
5733         Candidate.Width.isScalable()
5734             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5735             : Candidate.Width.getFixedValue();
5736     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5737                       << " costs: " << (Candidate.Cost / Width));
5738     if (i.isScalable())
5739       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5740                         << AssumedMinimumVscale << ")");
5741     LLVM_DEBUG(dbgs() << ".\n");
5742 #endif
5743 
5744     if (!C.second && !ForceVectorization) {
5745       LLVM_DEBUG(
5746           dbgs() << "LV: Not considering vector loop of width " << i
5747                  << " because it will not generate any vector instructions.\n");
5748       continue;
5749     }
5750 
5751     // If profitable add it to ProfitableVF list.
5752     if (isMoreProfitable(Candidate, ScalarCost))
5753       ProfitableVFs.push_back(Candidate);
5754 
5755     if (isMoreProfitable(Candidate, ChosenFactor))
5756       ChosenFactor = Candidate;
5757   }
5758 
5759   // Emit a report of VFs with invalid costs in the loop.
5760   if (!InvalidCosts.empty()) {
5761     // Group the remarks per instruction, keeping the instruction order from
5762     // InvalidCosts.
5763     std::map<Instruction *, unsigned> Numbering;
5764     unsigned I = 0;
5765     for (auto &Pair : InvalidCosts)
5766       if (!Numbering.count(Pair.first))
5767         Numbering[Pair.first] = I++;
5768 
5769     // Sort the list, first on instruction(number) then on VF.
5770     llvm::sort(InvalidCosts,
5771                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5772                  if (Numbering[A.first] != Numbering[B.first])
5773                    return Numbering[A.first] < Numbering[B.first];
5774                  ElementCountComparator ECC;
5775                  return ECC(A.second, B.second);
5776                });
5777 
5778     // For a list of ordered instruction-vf pairs:
5779     //   [(load, vf1), (load, vf2), (store, vf1)]
5780     // Group the instructions together to emit separate remarks for:
5781     //   load  (vf1, vf2)
5782     //   store (vf1)
5783     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5784     auto Subset = ArrayRef<InstructionVFPair>();
5785     do {
5786       if (Subset.empty())
5787         Subset = Tail.take_front(1);
5788 
5789       Instruction *I = Subset.front().first;
5790 
5791       // If the next instruction is different, or if there are no other pairs,
5792       // emit a remark for the collated subset. e.g.
5793       //   [(load, vf1), (load, vf2))]
5794       // to emit:
      //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5796       if (Subset == Tail || Tail[Subset.size()].first != I) {
5797         std::string OutString;
5798         raw_string_ostream OS(OutString);
5799         assert(!Subset.empty() && "Unexpected empty range");
5800         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5801         for (auto &Pair : Subset)
5802           OS << (Pair.second == Subset.front().second ? "" : ", ")
5803              << Pair.second;
5804         OS << "):";
5805         if (auto *CI = dyn_cast<CallInst>(I))
5806           OS << " call to " << CI->getCalledFunction()->getName();
5807         else
5808           OS << " " << I->getOpcodeName();
5809         OS.flush();
5810         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5811         Tail = Tail.drop_front(Subset.size());
5812         Subset = {};
5813       } else
5814         // Grow the subset by one element
5815         Subset = Tail.take_front(Subset.size() + 1);
5816     } while (!Tail.empty());
5817   }
5818 
5819   if (!EnableCondStoresVectorization && NumPredStores) {
5820     reportVectorizationFailure("There are conditional stores.",
5821         "store that is conditionally executed prevents vectorization",
5822         "ConditionalStore", ORE, TheLoop);
5823     ChosenFactor = ScalarCost;
5824   }
5825 
5826   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5827                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5828              << "LV: Vectorization seems to be not beneficial, "
5829              << "but was forced by a user.\n");
5830   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5831   return ChosenFactor;
5832 }
5833 
5834 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5835     const Loop &L, ElementCount VF) const {
5836   // Cross iteration phis such as reductions need special handling and are
5837   // currently unsupported.
5838   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
5839         return Legal->isFirstOrderRecurrence(&Phi) ||
5840                Legal->isReductionVariable(&Phi);
5841       }))
5842     return false;
5843 
5844   // Phis with uses outside of the loop require special handling and are
5845   // currently unsupported.
5846   for (auto &Entry : Legal->getInductionVars()) {
5847     // Look for uses of the value of the induction at the last iteration.
5848     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5849     for (User *U : PostInc->users())
5850       if (!L.contains(cast<Instruction>(U)))
5851         return false;
5852     // Look for uses of penultimate value of the induction.
5853     for (User *U : Entry.first->users())
5854       if (!L.contains(cast<Instruction>(U)))
5855         return false;
5856   }
5857 
5858   // Induction variables that are widened require special handling that is
5859   // currently not supported.
5860   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5861         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5862                  this->isProfitableToScalarize(Entry.first, VF));
5863       }))
5864     return false;
5865 
  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
5869   if (L.getExitingBlock() != L.getLoopLatch())
5870     return false;
5871 
5872   return true;
5873 }
5874 
5875 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5876     const ElementCount VF) const {
5877   // FIXME: We need a much better cost-model to take different parameters such
5878   // as register pressure, code size increase and cost of extra branches into
5879   // account. For now we apply a very crude heuristic and only consider loops
5880   // with vectorization factors larger than a certain value.
5881   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
5883   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5884     return false;
5885   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
5886     return true;
5887   return false;
5888 }
5889 
5890 VectorizationFactor
5891 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5892     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5893   VectorizationFactor Result = VectorizationFactor::Disabled();
5894   if (!EnableEpilogueVectorization) {
5895     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5896     return Result;
5897   }
5898 
5899   if (!isScalarEpilogueAllowed()) {
5900     LLVM_DEBUG(
5901         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5902                   "allowed.\n";);
5903     return Result;
5904   }
5905 
5906   // Not really a cost consideration, but check for unsupported cases here to
5907   // simplify the logic.
5908   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5909     LLVM_DEBUG(
5910         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5911                   "not a supported candidate.\n";);
5912     return Result;
5913   }
5914 
5915   if (EpilogueVectorizationForceVF > 1) {
5916     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC =
        ElementCount::getFixed(EpilogueVectorizationForceVF);
5918     if (LVP.hasPlanWithVF(ForcedEC))
5919       return {ForcedEC, 0};
5920     else {
5921       LLVM_DEBUG(
5922           dbgs()
5923               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5924       return Result;
5925     }
5926   }
5927 
5928   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5929       TheLoop->getHeader()->getParent()->hasMinSize()) {
5930     LLVM_DEBUG(
5931         dbgs()
5932             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5933     return Result;
5934   }
5935 
5936   auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5937   if (MainLoopVF.isScalable())
5938     LLVM_DEBUG(
5939         dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
5940                   "yet supported. Converting to fixed-width (VF="
5941                << FixedMainLoopVF << ") instead\n");
5942 
5943   if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) {
5944     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5945                          "this loop\n");
5946     return Result;
5947   }
5948 
5949   for (auto &NextVF : ProfitableVFs)
5950     if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) &&
5951         (Result.Width.getFixedValue() == 1 ||
5952          isMoreProfitable(NextVF, Result)) &&
5953         LVP.hasPlanWithVF(NextVF.Width))
5954       Result = NextVF;
5955 
5956   if (Result != VectorizationFactor::Disabled())
5957     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5958                       << Result.Width.getFixedValue() << "\n";);
5959   return Result;
5960 }
5961 
5962 std::pair<unsigned, unsigned>
5963 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5964   unsigned MinWidth = -1U;
5965   unsigned MaxWidth = 8;
5966   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5967   for (Type *T : ElementTypesInLoop) {
5968     MinWidth = std::min<unsigned>(
5969         MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5970     MaxWidth = std::max<unsigned>(
5971         MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5972   }
5973   return {MinWidth, MaxWidth};
5974 }
5975 
5976 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5977   ElementTypesInLoop.clear();
5978   // For each block.
5979   for (BasicBlock *BB : TheLoop->blocks()) {
5980     // For each instruction in the loop.
5981     for (Instruction &I : BB->instructionsWithoutDebug()) {
5982       Type *T = I.getType();
5983 
5984       // Skip ignored values.
5985       if (ValuesToIgnore.count(&I))
5986         continue;
5987 
5988       // Only examine Loads, Stores and PHINodes.
5989       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5990         continue;
5991 
5992       // Examine PHI nodes that are reduction variables. Update the type to
5993       // account for the recurrence type.
5994       if (auto *PN = dyn_cast<PHINode>(&I)) {
5995         if (!Legal->isReductionVariable(PN))
5996           continue;
5997         const RecurrenceDescriptor &RdxDesc =
5998             Legal->getReductionVars().find(PN)->second;
5999         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
6000             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
6001                                       RdxDesc.getRecurrenceType(),
6002                                       TargetTransformInfo::ReductionFlags()))
6003           continue;
6004         T = RdxDesc.getRecurrenceType();
6005       }
6006 
6007       // Examine the stored values.
6008       if (auto *ST = dyn_cast<StoreInst>(&I))
6009         T = ST->getValueOperand()->getType();
6010 
6011       // Ignore loaded pointer types and stored pointer types that are not
6012       // vectorizable.
6013       //
6014       // FIXME: The check here attempts to predict whether a load or store will
6015       //        be vectorized. We only know this for certain after a VF has
6016       //        been selected. Here, we assume that if an access can be
6017       //        vectorized, it will be. We should also look at extending this
6018       //        optimization to non-pointer types.
6019       //
6020       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6021           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
6022         continue;
6023 
6024       ElementTypesInLoop.insert(T);
6025     }
6026   }
6027 }
6028 
6029 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6030                                                            unsigned LoopCost) {
6031   // -- The interleave heuristics --
6032   // We interleave the loop in order to expose ILP and reduce the loop overhead.
6033   // There are many micro-architectural considerations that we can't predict
6034   // at this level. For example, frontend pressure (on decode or fetch) due to
6035   // code size, or the number and capabilities of the execution ports.
6036   //
6037   // We use the following heuristics to select the interleave count:
6038   // 1. If the code has reductions, then we interleave to break the cross
6039   // iteration dependency.
6040   // 2. If the loop is really small, then we interleave to reduce the loop
6041   // overhead.
6042   // 3. We don't interleave if we think that we will spill registers to memory
6043   // due to the increased register pressure.
6044 
6045   if (!isScalarEpilogueAllowed())
6046     return 1;
6047 
  // The maximum safe dependence distance was already used to limit the
  // vectorization factor, so do not interleave on top of that.
6049   if (Legal->getMaxSafeDepDistBytes() != -1U)
6050     return 1;
6051 
6052   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6053   const bool HasReductions = !Legal->getReductionVars().empty();
6054   // Do not interleave loops with a relatively small known or estimated trip
6055   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
6057   // because with the above conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
6059   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6060       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6061     return 1;
6062 
6063   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these counts below, so make sure each register class is
  // assumed to have at least one user to avoid dividing by zero.
6066   for (auto& pair : R.MaxLocalUsers) {
6067     pair.second = std::max(pair.second, 1U);
6068   }
6069 
6070   // We calculate the interleave count using the following formula.
6071   // Subtract the number of loop invariants from the number of available
6072   // registers. These registers are used by all of the interleaved instances.
6073   // Next, divide the remaining registers by the number of registers that is
6074   // required by the loop, in order to estimate how many parallel instances
6075   // fit without causing spills. All of this is rounded down if necessary to be
6076   // a power of two. We want power of two interleave count to simplify any
6077   // addressing operations or alignment considerations.
6078   // We also want power of two interleave counts to ensure that the induction
6079   // variable of the vector loop wraps to zero, when tail is folded by masking;
6080   // this currently happens when OptForSize, in which case IC is set to 1 above.
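  // Illustrative example (hypothetical numbers, not target values): with 32
  // registers in a class, 2 of them holding loop-invariant values, and at most
  // 6 in-loop values live at once, the estimate (before the induction-variable
  // adjustment below) is (32 - 2) / 6 = 5, which PowerOf2Floor rounds down to
  // an interleave count of 4 for that class.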
6081   unsigned IC = UINT_MAX;
6082 
6083   for (auto& pair : R.MaxLocalUsers) {
6084     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
6085     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6086                       << " registers of "
6087                       << TTI.getRegisterClassName(pair.first) << " register class\n");
6088     if (VF.isScalar()) {
6089       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6090         TargetNumRegisters = ForceTargetNumScalarRegs;
6091     } else {
6092       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6093         TargetNumRegisters = ForceTargetNumVectorRegs;
6094     }
6095     unsigned MaxLocalUsers = pair.second;
6096     unsigned LoopInvariantRegs = 0;
6097     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6098       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6099 
6100     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6101     // Don't count the induction variable as interleaved.
6102     if (EnableIndVarRegisterHeur) {
6103       TmpIC =
6104           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6105                         std::max(1U, (MaxLocalUsers - 1)));
6106     }
6107 
6108     IC = std::min(IC, TmpIC);
6109   }
6110 
6111   // Clamp the interleave ranges to reasonable counts.
6112   unsigned MaxInterleaveCount =
6113       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6114 
6115   // Check if the user has overridden the max.
6116   if (VF.isScalar()) {
6117     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6118       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6119   } else {
6120     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6121       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6122   }
6123 
  // If the trip count is a known or estimated compile-time constant, limit
  // the interleave count to be less than the trip count divided by VF,
  // provided it is at least 1.
6127   //
6128   // For scalable vectors we can't know if interleaving is beneficial. It may
6129   // not be beneficial for small loops if none of the lanes in the second vector
  // iteration is enabled. However, for larger loops, there is likely to be a
6131   // similar benefit as for fixed-width vectors. For now, we choose to leave
6132   // the InterleaveCount as if vscale is '1', although if some information about
6133   // the vector is known (e.g. min vector size), we can make a better decision.
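  // Illustrative example: with an estimated trip count of 12 and VF = 4, the
  // maximum interleave count is capped at 12 / 4 = 3 so that the interleaved
  // vector body does not overshoot the trip count.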
6134   if (BestKnownTC) {
6135     MaxInterleaveCount =
6136         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6137     // Make sure MaxInterleaveCount is greater than 0.
6138     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6139   }
6140 
6141   assert(MaxInterleaveCount > 0 &&
6142          "Maximum interleave count must be greater than 0");
6143 
  // Clamp the calculated IC to be between 1 and the max interleave count
6145   // that the target and trip count allows.
6146   if (IC > MaxInterleaveCount)
6147     IC = MaxInterleaveCount;
6148   else
6149     // Make sure IC is greater than 0.
6150     IC = std::max(1u, IC);
6151 
6152   assert(IC > 0 && "Interleave count must be greater than 0.");
6153 
6154   // If we did not calculate the cost for VF (because the user selected the VF)
6155   // then we calculate the cost of VF here.
6156   if (LoopCost == 0) {
6157     InstructionCost C = expectedCost(VF).first;
6158     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
6159     LoopCost = *C.getValue();
6160   }
6161 
6162   assert(LoopCost && "Non-zero loop cost expected");
6163 
6164   // Interleave if we vectorized this loop and there is a reduction that could
6165   // benefit from interleaving.
6166   if (VF.isVector() && HasReductions) {
6167     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6168     return IC;
6169   }
6170 
6171   // Note that if we've already vectorized the loop we will have done the
6172   // runtime check and so interleaving won't require further checks.
6173   bool InterleavingRequiresRuntimePointerCheck =
6174       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6175 
6176   // We want to interleave small loops in order to reduce the loop overhead and
6177   // potentially expose ILP opportunities.
6178   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6179                     << "LV: IC is " << IC << '\n'
6180                     << "LV: VF is " << VF << '\n');
6181   const bool AggressivelyInterleaveReductions =
6182       TTI.enableAggressiveInterleaving(HasReductions);
6183   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6184     // We assume that the cost overhead is 1 and we use the cost model
6185     // to estimate the cost of the loop and interleave until the cost of the
6186     // loop overhead is about 5% of the cost of the loop.
6187     unsigned SmallIC =
6188         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
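    // Illustrative example (SmallLoopCost is a tunable option, so the numbers
    // are hypothetical): with a small-loop threshold of 20 and a LoopCost of
    // 5, SmallIC is capped at PowerOf2Floor(20 / 5) = 4.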
6189 
6190     // Interleave until store/load ports (estimated by max interleave count) are
6191     // saturated.
6192     unsigned NumStores = Legal->getNumStores();
6193     unsigned NumLoads = Legal->getNumLoads();
6194     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6195     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
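    // Illustrative example: with IC = 8, 2 stores and 3 loads in the loop,
    // StoresIC = 4 and LoadsIC = 2; the larger of the two is compared against
    // SmallIC further below.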
6196 
6197     // There is little point in interleaving for reductions containing selects
6198     // and compares when VF=1 since it may just create more overhead than it's
6199     // worth for loops with small trip counts. This is because we still have to
6200     // do the final reduction after the loop.
6201     bool HasSelectCmpReductions =
6202         HasReductions &&
6203         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6204           const RecurrenceDescriptor &RdxDesc = Reduction.second;
6205           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
6206               RdxDesc.getRecurrenceKind());
6207         });
6208     if (HasSelectCmpReductions) {
6209       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
6210       return 1;
6211     }
6212 
6213     // If we have a scalar reduction (vector reductions are already dealt with
6214     // by this point), we can increase the critical path length if the loop
6215     // we're interleaving is inside another loop. For tree-wise reductions
6216     // set the limit to 2, and for ordered reductions it's best to disable
6217     // interleaving entirely.
6218     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6219       bool HasOrderedReductions =
6220           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6221             const RecurrenceDescriptor &RdxDesc = Reduction.second;
6222             return RdxDesc.isOrdered();
6223           });
6224       if (HasOrderedReductions) {
6225         LLVM_DEBUG(
6226             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
6227         return 1;
6228       }
6229 
6230       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6231       SmallIC = std::min(SmallIC, F);
6232       StoresIC = std::min(StoresIC, F);
6233       LoadsIC = std::min(LoadsIC, F);
6234     }
6235 
6236     if (EnableLoadStoreRuntimeInterleave &&
6237         std::max(StoresIC, LoadsIC) > SmallIC) {
6238       LLVM_DEBUG(
6239           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6240       return std::max(StoresIC, LoadsIC);
6241     }
6242 
6243     // If there are scalar reductions and TTI has enabled aggressive
6244     // interleaving for reductions, we will interleave to expose ILP.
6245     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6246         AggressivelyInterleaveReductions) {
6247       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6248       // Interleave no less than SmallIC but not as aggressive as the normal IC
6249       // to satisfy the rare situation when resources are too limited.
6250       return std::max(IC / 2, SmallIC);
6251     } else {
6252       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6253       return SmallIC;
6254     }
6255   }
6256 
6257   // Interleave if this is a large loop (small loops are already dealt with by
6258   // this point) that could benefit from interleaving.
6259   if (AggressivelyInterleaveReductions) {
6260     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6261     return IC;
6262   }
6263 
6264   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6265   return 1;
6266 }
6267 
6268 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6269 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6270   // This function calculates the register usage by measuring the highest number
6271   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
6273   // assign a number to each instruction. We use RPO to ensure that defs are
6274   // met before their users. We assume that each instruction that has in-loop
6275   // users starts an interval. We record every time that an in-loop value is
6276   // used, so we have a list of the first and last occurrences of each
6277   // instruction. Next, we transpose this data structure into a multi map that
6278   // holds the list of intervals that *end* at a specific location. This multi
6279   // map allows us to perform a linear search. We scan the instructions linearly
6280   // and record each time that a new interval starts, by placing it in a set.
6281   // If we find this value in the multi-map then we remove it from the set.
6282   // The max register usage is the maximum size of the set.
6283   // We also search for instructions that are defined outside the loop, but are
6284   // used inside the loop. We need this number separately from the max-interval
6285   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
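  // Rough illustrative sketch (hypothetical IR): for a body like
  //   %a = load ...          ; interval for %a opens
  //   %b = add %a, %inv      ; last use of %a, interval for %b opens
  //   store %b, ...          ; last use of %b
  // at most one in-loop value is counted live at a time, and %inv is counted
  // once as a loop-invariant register.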
6287   LoopBlocksDFS DFS(TheLoop);
6288   DFS.perform(LI);
6289 
6290   RegisterUsage RU;
6291 
6292   // Each 'key' in the map opens a new interval. The values
6293   // of the map are the index of the 'last seen' usage of the
6294   // instruction that is the key.
6295   using IntervalMap = DenseMap<Instruction *, unsigned>;
6296 
6297   // Maps instruction to its index.
6298   SmallVector<Instruction *, 64> IdxToInstr;
6299   // Marks the end of each interval.
6300   IntervalMap EndPoint;
6301   // Saves the list of instruction indices that are used in the loop.
6302   SmallPtrSet<Instruction *, 8> Ends;
6303   // Saves the list of values that are used in the loop but are
6304   // defined outside the loop, such as arguments and constants.
6305   SmallPtrSet<Value *, 8> LoopInvariants;
6306 
6307   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6308     for (Instruction &I : BB->instructionsWithoutDebug()) {
6309       IdxToInstr.push_back(&I);
6310 
6311       // Save the end location of each USE.
6312       for (Value *U : I.operands()) {
6313         auto *Instr = dyn_cast<Instruction>(U);
6314 
6315         // Ignore non-instruction values such as arguments, constants, etc.
6316         if (!Instr)
6317           continue;
6318 
6319         // If this instruction is outside the loop then record it and continue.
6320         if (!TheLoop->contains(Instr)) {
6321           LoopInvariants.insert(Instr);
6322           continue;
6323         }
6324 
6325         // Overwrite previous end points.
6326         EndPoint[Instr] = IdxToInstr.size();
6327         Ends.insert(Instr);
6328       }
6329     }
6330   }
6331 
6332   // Saves the list of intervals that end with the index in 'key'.
6333   using InstrList = SmallVector<Instruction *, 2>;
6334   DenseMap<unsigned, InstrList> TransposeEnds;
6335 
6336   // Transpose the EndPoints to a list of values that end at each index.
6337   for (auto &Interval : EndPoint)
6338     TransposeEnds[Interval.second].push_back(Interval.first);
6339 
6340   SmallPtrSet<Instruction *, 8> OpenIntervals;
6341   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6342   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6343 
6344   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6345 
6346   // A lambda that gets the register usage for the given type and VF.
6347   const auto &TTICapture = TTI;
6348   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6349     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6350       return 0;
6351     InstructionCost::CostType RegUsage =
6352         *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6353     assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
6354            "Nonsensical values for register usage.");
6355     return RegUsage;
6356   };
6357 
6358   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6359     Instruction *I = IdxToInstr[i];
6360 
6361     // Remove all of the instructions that end at this location.
6362     InstrList &List = TransposeEnds[i];
6363     for (Instruction *ToRemove : List)
6364       OpenIntervals.erase(ToRemove);
6365 
6366     // Ignore instructions that are never used within the loop.
6367     if (!Ends.count(I))
6368       continue;
6369 
6370     // Skip ignored values.
6371     if (ValuesToIgnore.count(I))
6372       continue;
6373 
6374     // For each VF find the maximum usage of registers.
6375     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6376       // Count the number of live intervals.
6377       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6378 
6379       if (VFs[j].isScalar()) {
6380         for (auto Inst : OpenIntervals) {
6381           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6382           if (RegUsage.find(ClassID) == RegUsage.end())
6383             RegUsage[ClassID] = 1;
6384           else
6385             RegUsage[ClassID] += 1;
6386         }
6387       } else {
6388         collectUniformsAndScalars(VFs[j]);
6389         for (auto Inst : OpenIntervals) {
6390           // Skip ignored values for VF > 1.
6391           if (VecValuesToIgnore.count(Inst))
6392             continue;
6393           if (isScalarAfterVectorization(Inst, VFs[j])) {
6394             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6395             if (RegUsage.find(ClassID) == RegUsage.end())
6396               RegUsage[ClassID] = 1;
6397             else
6398               RegUsage[ClassID] += 1;
6399           } else {
6400             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
6401             if (RegUsage.find(ClassID) == RegUsage.end())
6402               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6403             else
6404               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6405           }
6406         }
6407       }
6408 
6409       for (auto& pair : RegUsage) {
6410         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
6411           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
6412         else
6413           MaxUsages[j][pair.first] = pair.second;
6414       }
6415     }
6416 
6417     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6418                       << OpenIntervals.size() << '\n');
6419 
6420     // Add the current instruction to the list of open intervals.
6421     OpenIntervals.insert(I);
6422   }
6423 
6424   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6425     SmallMapVector<unsigned, unsigned, 4> Invariant;
6426 
6427     for (auto Inst : LoopInvariants) {
6428       unsigned Usage =
6429           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6430       unsigned ClassID =
6431           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6432       if (Invariant.find(ClassID) == Invariant.end())
6433         Invariant[ClassID] = Usage;
6434       else
6435         Invariant[ClassID] += Usage;
6436     }
6437 
6438     LLVM_DEBUG({
6439       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6440       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6441              << " item\n";
6442       for (const auto &pair : MaxUsages[i]) {
6443         dbgs() << "LV(REG): RegisterClass: "
6444                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6445                << " registers\n";
6446       }
6447       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6448              << " item\n";
6449       for (const auto &pair : Invariant) {
6450         dbgs() << "LV(REG): RegisterClass: "
6451                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6452                << " registers\n";
6453       }
6454     });
6455 
6456     RU.LoopInvariantRegs = Invariant;
6457     RU.MaxLocalUsers = MaxUsages[i];
6458     RUs[i] = RU;
6459   }
6460 
6461   return RUs;
6462 }
6463 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6465   // TODO: Cost model for emulated masked load/store is completely
6466   // broken. This hack guides the cost model to use an artificially
6467   // high enough value to practically disable vectorization with such
6468   // operations, except where previously deployed legality hack allowed
6469   // using very low cost values. This is to avoid regressions coming simply
6470   // from moving "masked load/store" check from legality to cost model.
6471   // Masked Load/Gather emulation was previously never allowed.
6472   // Limited number of Masked Store/Scatter emulation was allowed.
6473   assert(isPredicatedInst(I) &&
6474          "Expecting a scalar emulated instruction");
6475   return isa<LoadInst>(I) ||
6476          (isa<StoreInst>(I) &&
6477           NumPredStores > NumberOfStoresToPredicate);
6478 }
6479 
6480 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6481   // If we aren't vectorizing the loop, or if we've already collected the
6482   // instructions to scalarize, there's nothing to do. Collection may already
6483   // have occurred if we have a user-selected VF and are now computing the
6484   // expected cost for interleaving.
6485   if (VF.isScalar() || VF.isZero() ||
6486       InstsToScalarize.find(VF) != InstsToScalarize.end())
6487     return;
6488 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6490   // not profitable to scalarize any instructions, the presence of VF in the
6491   // map will indicate that we've analyzed it already.
6492   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6493 
6494   // Find all the instructions that are scalar with predication in the loop and
6495   // determine if it would be better to not if-convert the blocks they are in.
6496   // If so, we also record the instructions to scalarize.
6497   for (BasicBlock *BB : TheLoop->blocks()) {
6498     if (!blockNeedsPredicationForAnyReason(BB))
6499       continue;
6500     for (Instruction &I : *BB)
6501       if (isScalarWithPredication(&I)) {
6502         ScalarCostsTy ScalarCosts;
6503         // Do not apply discount if scalable, because that would lead to
6504         // invalid scalarization costs.
6505         // Do not apply discount logic if hacked cost is needed
6506         // for emulated masked memrefs.
6507         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
6508             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6509           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6510         // Remember that BB will remain after vectorization.
6511         PredicatedBBsAfterVectorization.insert(BB);
6512       }
6513   }
6514 }
6515 
6516 int LoopVectorizationCostModel::computePredInstDiscount(
6517     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6518   assert(!isUniformAfterVectorization(PredInst, VF) &&
6519          "Instruction marked uniform-after-vectorization will be predicated");
6520 
6521   // Initialize the discount to zero, meaning that the scalar version and the
6522   // vector version cost the same.
6523   InstructionCost Discount = 0;
6524 
6525   // Holds instructions to analyze. The instructions we visit are mapped in
6526   // ScalarCosts. Those instructions are the ones that would be scalarized if
6527   // we find that the scalar version costs less.
6528   SmallVector<Instruction *, 8> Worklist;
6529 
6530   // Returns true if the given instruction can be scalarized.
6531   auto canBeScalarized = [&](Instruction *I) -> bool {
6532     // We only attempt to scalarize instructions forming a single-use chain
6533     // from the original predicated block that would otherwise be vectorized.
6534     // Although not strictly necessary, we give up on instructions we know will
6535     // already be scalar to avoid traversing chains that are unlikely to be
6536     // beneficial.
6537     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6538         isScalarAfterVectorization(I, VF))
6539       return false;
6540 
6541     // If the instruction is scalar with predication, it will be analyzed
6542     // separately. We ignore it within the context of PredInst.
6543     if (isScalarWithPredication(I))
6544       return false;
6545 
6546     // If any of the instruction's operands are uniform after vectorization,
6547     // the instruction cannot be scalarized. This prevents, for example, a
6548     // masked load from being scalarized.
6549     //
6550     // We assume we will only emit a value for lane zero of an instruction
6551     // marked uniform after vectorization, rather than VF identical values.
6552     // Thus, if we scalarize an instruction that uses a uniform, we would
6553     // create uses of values corresponding to the lanes we aren't emitting code
6554     // for. This behavior can be changed by allowing getScalarValue to clone
6555     // the lane zero values for uniforms rather than asserting.
6556     for (Use &U : I->operands())
6557       if (auto *J = dyn_cast<Instruction>(U.get()))
6558         if (isUniformAfterVectorization(J, VF))
6559           return false;
6560 
6561     // Otherwise, we can scalarize the instruction.
6562     return true;
6563   };
6564 
6565   // Compute the expected cost discount from scalarizing the entire expression
6566   // feeding the predicated instruction. We currently only consider expressions
6567   // that are single-use instruction chains.
6568   Worklist.push_back(PredInst);
6569   while (!Worklist.empty()) {
6570     Instruction *I = Worklist.pop_back_val();
6571 
6572     // If we've already analyzed the instruction, there's nothing to do.
6573     if (ScalarCosts.find(I) != ScalarCosts.end())
6574       continue;
6575 
6576     // Compute the cost of the vector instruction. Note that this cost already
6577     // includes the scalarization overhead of the predicated instruction.
6578     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6579 
6580     // Compute the cost of the scalarized instruction. This cost is the cost of
6581     // the instruction as if it wasn't if-converted and instead remained in the
6582     // predicated block. We will scale this cost by block probability after
6583     // computing the scalarization overhead.
6584     InstructionCost ScalarCost =
6585         VF.getFixedValue() *
6586         getInstructionCost(I, ElementCount::getFixed(1)).first;
6587 
6588     // Compute the scalarization overhead of needed insertelement instructions
6589     // and phi nodes.
6590     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6591       ScalarCost += TTI.getScalarizationOverhead(
6592           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6593           APInt::getAllOnes(VF.getFixedValue()), true, false);
6594       ScalarCost +=
6595           VF.getFixedValue() *
6596           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6597     }
6598 
6599     // Compute the scalarization overhead of needed extractelement
6600     // instructions. For each of the instruction's operands, if the operand can
6601     // be scalarized, add it to the worklist; otherwise, account for the
6602     // overhead.
6603     for (Use &U : I->operands())
6604       if (auto *J = dyn_cast<Instruction>(U.get())) {
6605         assert(VectorType::isValidElementType(J->getType()) &&
6606                "Instruction has non-scalar type");
6607         if (canBeScalarized(J))
6608           Worklist.push_back(J);
6609         else if (needsExtract(J, VF)) {
6610           ScalarCost += TTI.getScalarizationOverhead(
6611               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6612               APInt::getAllOnes(VF.getFixedValue()), false, true);
6613         }
6614       }
6615 
6616     // Scale the total scalar cost by block probability.
6617     ScalarCost /= getReciprocalPredBlockProb();
6618 
6619     // Compute the discount. A non-negative discount means the vector version
6620     // of the instruction costs more, and scalarizing would be beneficial.
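    // Illustrative example: if the vector form of I costs 10 and its
    // scalarized form (after scaling by the predicated-block probability)
    // costs 6, the discount grows by 4 in favor of scalarization.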
6621     Discount += VectorCost - ScalarCost;
6622     ScalarCosts[I] = ScalarCost;
6623   }
6624 
6625   return *Discount.getValue();
6626 }
6627 
6628 LoopVectorizationCostModel::VectorizationCostTy
6629 LoopVectorizationCostModel::expectedCost(
6630     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6631   VectorizationCostTy Cost;
6632 
6633   // For each block.
6634   for (BasicBlock *BB : TheLoop->blocks()) {
6635     VectorizationCostTy BlockCost;
6636 
6637     // For each instruction in the old loop.
6638     for (Instruction &I : BB->instructionsWithoutDebug()) {
6639       // Skip ignored values.
6640       if (ValuesToIgnore.count(&I) ||
6641           (VF.isVector() && VecValuesToIgnore.count(&I)))
6642         continue;
6643 
6644       VectorizationCostTy C = getInstructionCost(&I, VF);
6645 
6646       // Check if we should override the cost.
6647       if (C.first.isValid() &&
6648           ForceTargetInstructionCost.getNumOccurrences() > 0)
6649         C.first = InstructionCost(ForceTargetInstructionCost);
6650 
6651       // Keep a list of instructions with invalid costs.
6652       if (Invalid && !C.first.isValid())
6653         Invalid->emplace_back(&I, VF);
6654 
6655       BlockCost.first += C.first;
6656       BlockCost.second |= C.second;
6657       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6658                         << " for VF " << VF << " For instruction: " << I
6659                         << '\n');
6660     }
6661 
6662     // If we are vectorizing a predicated block, it will have been
6663     // if-converted. This means that the block's instructions (aside from
6664     // stores and instructions that may divide by zero) will now be
6665     // unconditionally executed. For the scalar case, we may not always execute
6666     // the predicated block, if it is an if-else block. Thus, scale the block's
6667     // cost by the probability of executing it. blockNeedsPredication from
6668     // Legal is used so as to not include all blocks in tail folded loops.
6669     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6670       BlockCost.first /= getReciprocalPredBlockProb();
6671 
6672     Cost.first += BlockCost.first;
6673     Cost.second |= BlockCost.second;
6674   }
6675 
6676   return Cost;
6677 }
6678 
6679 /// Gets Address Access SCEV after verifying that the access pattern
6680 /// is loop invariant except the induction variable dependence.
6681 ///
6682 /// This SCEV can be sent to the Target in order to estimate the address
6683 /// calculation cost.
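///
/// Illustrative example (hypothetical IR): for a pointer operand such as
///   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
/// where %base is loop-invariant and %iv is an induction variable, the
/// returned SCEV is an add-recurrence like {%base,+,4}<%loop>.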
6684 static const SCEV *getAddressAccessSCEV(
6685               Value *Ptr,
6686               LoopVectorizationLegality *Legal,
6687               PredicatedScalarEvolution &PSE,
6688               const Loop *TheLoop) {
6689 
6690   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6691   if (!Gep)
6692     return nullptr;
6693 
6694   // We are looking for a gep with all loop invariant indices except for one
6695   // which should be an induction variable.
6696   auto SE = PSE.getSE();
6697   unsigned NumOperands = Gep->getNumOperands();
6698   for (unsigned i = 1; i < NumOperands; ++i) {
6699     Value *Opd = Gep->getOperand(i);
6700     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6701         !Legal->isInductionVariable(Opd))
6702       return nullptr;
6703   }
6704 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6706   return PSE.getSCEV(Ptr);
6707 }
6708 
6709 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6710   return Legal->hasStride(I->getOperand(0)) ||
6711          Legal->hasStride(I->getOperand(1));
6712 }
6713 
6714 InstructionCost
6715 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6716                                                         ElementCount VF) {
6717   assert(VF.isVector() &&
6718          "Scalarization cost of instruction implies vectorization.");
6719   if (VF.isScalable())
6720     return InstructionCost::getInvalid();
6721 
6722   Type *ValTy = getLoadStoreType(I);
6723   auto SE = PSE.getSE();
6724 
6725   unsigned AS = getLoadStoreAddressSpace(I);
6726   Value *Ptr = getLoadStorePointerOperand(I);
6727   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6728   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6729   //       that it is being called from this specific place.
6730 
6731   // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6733   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6734 
6735   // Get the cost of the scalar memory instruction and address computation.
6736   InstructionCost Cost =
6737       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6738 
6739   // Don't pass *I here, since it is scalar but will actually be part of a
6740   // vectorized loop where the user of it is a vectorized instruction.
6741   const Align Alignment = getLoadStoreAlignment(I);
6742   Cost += VF.getKnownMinValue() *
6743           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6744                               AS, TTI::TCK_RecipThroughput);
6745 
6746   // Get the overhead of the extractelement and insertelement instructions
6747   // we might create due to scalarization.
6748   Cost += getScalarizationOverhead(I, VF);
6749 
6750   // If we have a predicated load/store, it will need extra i1 extracts and
6751   // conditional branches, but may not be executed for each vector lane. Scale
6752   // the cost by the probability of executing the predicated block.
6753   if (isPredicatedInst(I)) {
6754     Cost /= getReciprocalPredBlockProb();
6755 
6756     // Add the cost of an i1 extract and a branch
6757     auto *Vec_i1Ty =
6758         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6759     Cost += TTI.getScalarizationOverhead(
6760         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6761         /*Insert=*/false, /*Extract=*/true);
6762     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6763 
6764     if (useEmulatedMaskMemRefHack(I))
6765       // Artificially setting to a high enough value to practically disable
6766       // vectorization with such operations.
6767       Cost = 3000000;
6768   }
6769 
6770   return Cost;
6771 }
6772 
6773 InstructionCost
6774 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6775                                                     ElementCount VF) {
6776   Type *ValTy = getLoadStoreType(I);
6777   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6778   Value *Ptr = getLoadStorePointerOperand(I);
6779   unsigned AS = getLoadStoreAddressSpace(I);
6780   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6781   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6782 
6783   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6784          "Stride should be 1 or -1 for consecutive memory access");
6785   const Align Alignment = getLoadStoreAlignment(I);
6786   InstructionCost Cost = 0;
6787   if (Legal->isMaskRequired(I))
6788     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6789                                       CostKind);
6790   else
6791     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6792                                 CostKind, I);
6793 
6794   bool Reverse = ConsecutiveStride < 0;
6795   if (Reverse)
6796     Cost +=
6797         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6798   return Cost;
6799 }
6800 
6801 InstructionCost
6802 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6803                                                 ElementCount VF) {
6804   assert(Legal->isUniformMemOp(*I));
6805 
6806   Type *ValTy = getLoadStoreType(I);
6807   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6808   const Align Alignment = getLoadStoreAlignment(I);
6809   unsigned AS = getLoadStoreAddressSpace(I);
6810   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6811   if (isa<LoadInst>(I)) {
6812     return TTI.getAddressComputationCost(ValTy) +
6813            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6814                                CostKind) +
6815            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6816   }
6817   StoreInst *SI = cast<StoreInst>(I);
6818 
6819   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6820   return TTI.getAddressComputationCost(ValTy) +
6821          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6822                              CostKind) +
6823          (isLoopInvariantStoreValue
6824               ? 0
6825               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6826                                        VF.getKnownMinValue() - 1));
6827 }
6828 
6829 InstructionCost
6830 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6831                                                  ElementCount VF) {
6832   Type *ValTy = getLoadStoreType(I);
6833   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6834   const Align Alignment = getLoadStoreAlignment(I);
6835   const Value *Ptr = getLoadStorePointerOperand(I);
6836 
6837   return TTI.getAddressComputationCost(VectorTy) +
6838          TTI.getGatherScatterOpCost(
6839              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6840              TargetTransformInfo::TCK_RecipThroughput, I);
6841 }
6842 
6843 InstructionCost
6844 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6845                                                    ElementCount VF) {
6846   // TODO: Once we have support for interleaving with scalable vectors
6847   // we can calculate the cost properly here.
6848   if (VF.isScalable())
6849     return InstructionCost::getInvalid();
6850 
6851   Type *ValTy = getLoadStoreType(I);
6852   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6853   unsigned AS = getLoadStoreAddressSpace(I);
6854 
6855   auto Group = getInterleavedAccessGroup(I);
6856   assert(Group && "Fail to get an interleaved access group.");
6857 
6858   unsigned InterleaveFactor = Group->getFactor();
6859   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6860 
6861   // Holds the indices of existing members in the interleaved group.
6862   SmallVector<unsigned, 4> Indices;
6863   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6864     if (Group->getMember(IF))
6865       Indices.push_back(IF);
6866 
6867   // Calculate the cost of the whole interleaved group.
6868   bool UseMaskForGaps =
6869       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6870       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6871   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6872       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6873       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6874 
6875   if (Group->isReverse()) {
6876     // TODO: Add support for reversed masked interleaved access.
6877     assert(!Legal->isMaskRequired(I) &&
6878            "Reverse masked interleaved access not supported.");
6879     Cost +=
6880         Group->getNumMembers() *
6881         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6882   }
6883   return Cost;
6884 }
6885 
6886 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6887     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6888   using namespace llvm::PatternMatch;
6889   // Early exit for no inloop reductions
6890   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6891     return None;
6892   auto *VectorTy = cast<VectorType>(Ty);
6893 
  // We are looking for one of the following patterns, and we compute the
  // minimal acceptable cost for it:
6895   //  reduce(mul(ext(A), ext(B))) or
6896   //  reduce(mul(A, B)) or
6897   //  reduce(ext(A)) or
6898   //  reduce(A).
6899   // The basic idea is that we walk down the tree to do that, finding the root
6900   // reduction instruction in InLoopReductionImmediateChains. From there we find
6901   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
6902   // of the components. If the reduction cost is lower then we return it for the
6903   // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return an invalid cost so that the original cost method
6905   // should be used.
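  //
  // Illustrative sketch (hypothetical IR) of the reduce(add(mul(ext, ext)))
  // shape this is trying to recognize:
  //   %a.ext = sext <8 x i16> %a to <8 x i32>
  //   %b.ext = sext <8 x i16> %b to <8 x i32>
  //   %mul   = mul <8 x i32> %a.ext, %b.ext
  //   %red   = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %mul)
  // On targets with an extending multiply-add reduction, costing the whole
  // pattern at once can be cheaper than summing the costs of its parts.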
6906   Instruction *RetI = I;
6907   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6908     if (!RetI->hasOneUser())
6909       return None;
6910     RetI = RetI->user_back();
6911   }
6912   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6913       RetI->user_back()->getOpcode() == Instruction::Add) {
6914     if (!RetI->hasOneUser())
6915       return None;
6916     RetI = RetI->user_back();
6917   }
6918 
6919   // Test if the found instruction is a reduction, and if not return an invalid
6920   // cost specifying the parent to use the original cost modelling.
6921   if (!InLoopReductionImmediateChains.count(RetI))
6922     return None;
6923 
6924   // Find the reduction this chain is a part of and calculate the basic cost of
6925   // the reduction on its own.
6926   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6927   Instruction *ReductionPhi = LastChain;
6928   while (!isa<PHINode>(ReductionPhi))
6929     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6930 
6931   const RecurrenceDescriptor &RdxDesc =
6932       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6933 
6934   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6935       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6936 
6937   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6938   // normal fmul instruction to the cost of the fadd reduction.
6939   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6940     BaseCost +=
6941         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6942 
6943   // If we're using ordered reductions then we can just return the base cost
6944   // here, since getArithmeticReductionCost calculates the full ordered
6945   // reduction cost when FP reassociation is not allowed.
6946   if (useOrderedReductions(RdxDesc))
6947     return BaseCost;
6948 
6949   // Get the operand that was not the reduction chain and match it to one of the
6950   // patterns, returning the better cost if it is found.
6951   Instruction *RedOp = RetI->getOperand(1) == LastChain
6952                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6953                            : dyn_cast<Instruction>(RetI->getOperand(1));
6954 
6955   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6956 
6957   Instruction *Op0, *Op1;
6958   if (RedOp &&
6959       match(RedOp,
6960             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6961       match(Op0, m_ZExtOrSExt(m_Value())) &&
6962       Op0->getOpcode() == Op1->getOpcode() &&
6963       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6964       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6965       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6966 
    // Matched reduce(ext(mul(ext(A), ext(B))))
6968     // Note that the extend opcodes need to all match, or if A==B they will have
6969     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6970     // which is equally fine.
6971     bool IsUnsigned = isa<ZExtInst>(Op0);
6972     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6973     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6974 
6975     InstructionCost ExtCost =
6976         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6977                              TTI::CastContextHint::None, CostKind, Op0);
6978     InstructionCost MulCost =
6979         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6980     InstructionCost Ext2Cost =
6981         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6982                              TTI::CastContextHint::None, CostKind, RedOp);
6983 
6984     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6985         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6986         CostKind);
6987 
6988     if (RedCost.isValid() &&
6989         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6990       return I == RetI ? RedCost : 0;
6991   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6992              !TheLoop->isLoopInvariant(RedOp)) {
6993     // Matched reduce(ext(A))
6994     bool IsUnsigned = isa<ZExtInst>(RedOp);
6995     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6996     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6997         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6998         CostKind);
6999 
7000     InstructionCost ExtCost =
7001         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
7002                              TTI::CastContextHint::None, CostKind, RedOp);
7003     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
7004       return I == RetI ? RedCost : 0;
7005   } else if (RedOp &&
7006              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
7007     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
7008         Op0->getOpcode() == Op1->getOpcode() &&
7009         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
7010       bool IsUnsigned = isa<ZExtInst>(Op0);
7011       Type *Op0Ty = Op0->getOperand(0)->getType();
7012       Type *Op1Ty = Op1->getOperand(0)->getType();
7013       Type *LargestOpTy =
7014           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
7015                                                                     : Op0Ty;
7016       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
7017 
7018       // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
7019       // different sizes. We take the largest type as the ext to reduce, and add
7020       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
7021       InstructionCost ExtCost0 = TTI.getCastInstrCost(
7022           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
7023           TTI::CastContextHint::None, CostKind, Op0);
7024       InstructionCost ExtCost1 = TTI.getCastInstrCost(
7025           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
7026           TTI::CastContextHint::None, CostKind, Op1);
7027       InstructionCost MulCost =
7028           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7029 
7030       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7031           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7032           CostKind);
7033       InstructionCost ExtraExtCost = 0;
7034       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
7035         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
7036         ExtraExtCost = TTI.getCastInstrCost(
7037             ExtraExtOp->getOpcode(), ExtType,
7038             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
7039             TTI::CastContextHint::None, CostKind, ExtraExtOp);
7040       }
7041 
7042       if (RedCost.isValid() &&
7043           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
7044         return I == RetI ? RedCost : 0;
7045     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
7046       // Matched reduce(mul())
7047       InstructionCost MulCost =
7048           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7049 
7050       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7051           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
7052           CostKind);
7053 
7054       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
7055         return I == RetI ? RedCost : 0;
7056     }
7057   }
7058 
7059   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
7060 }
7061 
7062 InstructionCost
7063 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7064                                                      ElementCount VF) {
7065   // Calculate scalar cost only. Vectorization cost should be ready at this
7066   // moment.
7067   if (VF.isScalar()) {
7068     Type *ValTy = getLoadStoreType(I);
7069     const Align Alignment = getLoadStoreAlignment(I);
7070     unsigned AS = getLoadStoreAddressSpace(I);
7071 
7072     return TTI.getAddressComputationCost(ValTy) +
7073            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7074                                TTI::TCK_RecipThroughput, I);
7075   }
7076   return getWideningCost(I, VF);
7077 }
7078 
7079 LoopVectorizationCostModel::VectorizationCostTy
7080 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7081                                                ElementCount VF) {
7082   // If we know that this instruction will remain uniform, check the cost of
7083   // the scalar version.
7084   if (isUniformAfterVectorization(I, VF))
7085     VF = ElementCount::getFixed(1);
7086 
7087   if (VF.isVector() && isProfitableToScalarize(I, VF))
7088     return VectorizationCostTy(InstsToScalarize[VF][I], false);
7089 
7090   // Forced scalars do not have any scalarization overhead.
7091   auto ForcedScalar = ForcedScalars.find(VF);
7092   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
7093     auto InstSet = ForcedScalar->second;
7094     if (InstSet.count(I))
7095       return VectorizationCostTy(
7096           (getInstructionCost(I, ElementCount::getFixed(1)).first *
7097            VF.getKnownMinValue()),
7098           false);
7099   }
7100 
7101   Type *VectorTy;
7102   InstructionCost C = getInstructionCost(I, VF, VectorTy);
7103 
7104   bool TypeNotScalarized = false;
7105   if (VF.isVector() && VectorTy->isVectorTy()) {
7106     unsigned NumParts = TTI.getNumberOfParts(VectorTy);
7107     if (NumParts)
7108       TypeNotScalarized = NumParts < VF.getKnownMinValue();
7109     else
7110       C = InstructionCost::getInvalid();
7111   }
7112   return VectorizationCostTy(C, TypeNotScalarized);
7113 }
7114 
7115 InstructionCost
7116 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
7117                                                      ElementCount VF) const {
7118 
7119   // There is no mechanism yet to create a scalable scalarization loop,
7120   // so this is currently Invalid.
7121   if (VF.isScalable())
7122     return InstructionCost::getInvalid();
7123 
7124   if (VF.isScalar())
7125     return 0;
7126 
7127   InstructionCost Cost = 0;
7128   Type *RetTy = ToVectorTy(I->getType(), VF);
7129   if (!RetTy->isVoidTy() &&
7130       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
7131     Cost += TTI.getScalarizationOverhead(
7132         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
7133         false);
7134 
7135   // Some targets keep addresses scalar.
7136   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
7137     return Cost;
7138 
7139   // Some targets support efficient element stores.
7140   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
7141     return Cost;
7142 
7143   // Collect operands to consider.
7144   CallInst *CI = dyn_cast<CallInst>(I);
7145   Instruction::op_range Ops = CI ? CI->args() : I->operands();
7146 
7147   // Skip operands that do not require extraction/scalarization and do not incur
7148   // any overhead.
7149   SmallVector<Type *> Tys;
7150   for (auto *V : filterExtractingOperands(Ops, VF))
7151     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7152   return Cost + TTI.getOperandsScalarizationOverhead(
7153                     filterExtractingOperands(Ops, VF), Tys);
7154 }
7155 
7156 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
7157   if (VF.isScalar())
7158     return;
7159   NumPredStores = 0;
7160   for (BasicBlock *BB : TheLoop->blocks()) {
7161     // For each instruction in the old loop.
7162     for (Instruction &I : *BB) {
7163       Value *Ptr =  getLoadStorePointerOperand(&I);
7164       if (!Ptr)
7165         continue;
7166 
7167       // TODO: We should generate better code and update the cost model for
7168       // predicated uniform stores. Today they are treated as any other
7169       // predicated store (see added test cases in
7170       // invariant-store-vectorization.ll).
7171       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
7172         NumPredStores++;
7173 
7174       if (Legal->isUniformMemOp(I)) {
7175         // TODO: Avoid replicating loads and stores instead of
7176         // relying on instcombine to remove them.
7177         // Load: Scalar load + broadcast
7178         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
7179         InstructionCost Cost;
7180         if (isa<StoreInst>(&I) && VF.isScalable() &&
7181             isLegalGatherOrScatter(&I)) {
7182           Cost = getGatherScatterCost(&I, VF);
7183           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
7184         } else {
7185           assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
7186                  "Cannot yet scalarize uniform stores");
7187           Cost = getUniformMemOpCost(&I, VF);
7188           setWideningDecision(&I, VF, CM_Scalarize, Cost);
7189         }
7190         continue;
7191       }
7192 
7193       // We assume that widening is the best solution when possible.
7194       if (memoryInstructionCanBeWidened(&I, VF)) {
7195         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7196         int ConsecutiveStride = Legal->isConsecutivePtr(
7197             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
7198         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7199                "Expected consecutive stride.");
7200         InstWidening Decision =
7201             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7202         setWideningDecision(&I, VF, Decision, Cost);
7203         continue;
7204       }
7205 
7206       // Choose between Interleaving, Gather/Scatter or Scalarization.
7207       InstructionCost InterleaveCost = InstructionCost::getInvalid();
7208       unsigned NumAccesses = 1;
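      // Options whose cost stays invalid compare greater than any valid cost,
      // so they lose to any feasible alternative in the comparisons below.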
7209       if (isAccessInterleaved(&I)) {
7210         auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Failed to get an interleaved access group.");
7212 
7213         // Make one decision for the whole group.
7214         if (getWideningDecision(&I, VF) != CM_Unknown)
7215           continue;
7216 
7217         NumAccesses = Group->getNumMembers();
7218         if (interleavedAccessCanBeWidened(&I, VF))
7219           InterleaveCost = getInterleaveGroupCost(&I, VF);
7220       }
7221 
7222       InstructionCost GatherScatterCost =
7223           isLegalGatherOrScatter(&I)
7224               ? getGatherScatterCost(&I, VF) * NumAccesses
7225               : InstructionCost::getInvalid();
7226 
7227       InstructionCost ScalarizationCost =
7228           getMemInstScalarizationCost(&I, VF) * NumAccesses;
7229 
7230       // Choose better solution for the current VF,
7231       // write down this decision and use it during vectorization.
7232       InstructionCost Cost;
7233       InstWidening Decision;
7234       if (InterleaveCost <= GatherScatterCost &&
7235           InterleaveCost < ScalarizationCost) {
7236         Decision = CM_Interleave;
7237         Cost = InterleaveCost;
7238       } else if (GatherScatterCost < ScalarizationCost) {
7239         Decision = CM_GatherScatter;
7240         Cost = GatherScatterCost;
7241       } else {
7242         Decision = CM_Scalarize;
7243         Cost = ScalarizationCost;
7244       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The cost is attached to the group as a
      // whole, but it will actually be assigned to a single instruction.
7248       if (auto Group = getInterleavedAccessGroup(&I))
7249         setWideningDecision(Group, VF, Decision, Cost);
7250       else
7251         setWideningDecision(&I, VF, Decision, Cost);
7252     }
7253   }
7254 
7255   // Make sure that any load of address and any other address computation
7256   // remains scalar unless there is gather/scatter support. This avoids
7257   // inevitable extracts into address registers, and also has the benefit of
7258   // activating LSR more, since that pass can't optimize vectorized
7259   // addresses.
7260   if (TTI.prefersVectorizedAddressing())
7261     return;
7262 
7263   // Start with all scalar pointer uses.
7264   SmallPtrSet<Instruction *, 8> AddrDefs;
7265   for (BasicBlock *BB : TheLoop->blocks())
7266     for (Instruction &I : *BB) {
7267       Instruction *PtrDef =
7268         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7269       if (PtrDef && TheLoop->contains(PtrDef) &&
7270           getWideningDecision(&I, VF) != CM_GatherScatter)
7271         AddrDefs.insert(PtrDef);
7272     }
7273 
7274   // Add all instructions used to generate the addresses.
7275   SmallVector<Instruction *, 4> Worklist;
7276   append_range(Worklist, AddrDefs);
7277   while (!Worklist.empty()) {
7278     Instruction *I = Worklist.pop_back_val();
7279     for (auto &Op : I->operands())
7280       if (auto *InstOp = dyn_cast<Instruction>(Op))
7281         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7282             AddrDefs.insert(InstOp).second)
7283           Worklist.push_back(InstOp);
7284   }
7285 
7286   for (auto *I : AddrDefs) {
7287     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // the cost functions, but since this involves finding out whether the
      // loaded register is involved in an address computation, it is instead
      // changed here once we know this is the case.
7292       InstWidening Decision = getWideningDecision(I, VF);
7293       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7294         // Scalarize a widened load of address.
7295         setWideningDecision(
7296             I, VF, CM_Scalarize,
7297             (VF.getKnownMinValue() *
7298              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7299       else if (auto Group = getInterleavedAccessGroup(I)) {
7300         // Scalarize an interleave group of address loads.
7301         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7302           if (Instruction *Member = Group->getMember(I))
7303             setWideningDecision(
7304                 Member, VF, CM_Scalarize,
7305                 (VF.getKnownMinValue() *
7306                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7307         }
7308       }
7309     } else
      // Make sure I gets scalarized and receives a cost estimate without
      // scalarization overhead.
7312       ForcedScalars[VF].insert(I);
7313   }
7314 }
7315 
7316 InstructionCost
7317 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7318                                                Type *&VectorTy) {
7319   Type *RetTy = I->getType();
7320   if (canTruncateToMinimalBitwidth(I, VF))
7321     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7322   auto SE = PSE.getSE();
7323   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7324 
7325   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7326                                                 ElementCount VF) -> bool {
7327     if (VF.isScalar())
7328       return true;
7329 
7330     auto Scalarized = InstsToScalarize.find(VF);
7331     assert(Scalarized != InstsToScalarize.end() &&
7332            "VF not yet analyzed for scalarization profitability");
7333     return !Scalarized->second.count(I) &&
7334            llvm::all_of(I->users(), [&](User *U) {
7335              auto *UI = cast<Instruction>(U);
7336              return !Scalarized->second.count(UI);
7337            });
7338   };
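  // The lambda is only used in the assert below; the cast avoids
  // unused-variable warnings when asserts are disabled.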
7339   (void) hasSingleCopyAfterVectorization;
7340 
7341   if (isScalarAfterVectorization(I, VF)) {
    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // we don't have to multiply the instruction cost by VF.
7347     assert(I->getOpcode() == Instruction::GetElementPtr ||
7348            I->getOpcode() == Instruction::PHI ||
7349            (I->getOpcode() == Instruction::BitCast &&
7350             I->getType()->isPointerTy()) ||
7351            hasSingleCopyAfterVectorization(I, VF));
7352     VectorTy = RetTy;
7353   } else
7354     VectorTy = ToVectorTy(RetTy, VF);
7355 
7356   // TODO: We need to estimate the cost of intrinsic calls.
7357   switch (I->getOpcode()) {
7358   case Instruction::GetElementPtr:
7359     // We mark this instruction as zero-cost because the cost of GEPs in
7360     // vectorized code depends on whether the corresponding memory instruction
7361     // is scalarized or not. Therefore, we handle GEPs with the memory
7362     // instruction cost.
7363     return 0;
7364   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
7368     bool ScalarPredicatedBB = false;
7369     BranchInst *BI = cast<BranchInst>(I);
7370     if (VF.isVector() && BI->isConditional() &&
7371         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7372          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7373       ScalarPredicatedBB = true;
7374 
7375     if (ScalarPredicatedBB) {
      // Scalarization of predicated instructions is not possible for
      // scalable vectors.
7377       if (VF.isScalable())
7378         return InstructionCost::getInvalid();
7379       // Return cost for branches around scalarized and predicated blocks.
7380       auto *Vec_i1Ty =
7381           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7382       return (
7383           TTI.getScalarizationOverhead(
7384               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7385           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7386     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7387       // The back-edge branch will remain, as will all scalar branches.
7388       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7389     else
7390       // This branch will be eliminated by if-conversion.
7391       return 0;
7392     // Note: We currently assume zero cost for an unconditional branch inside
7393     // a predicated block since it will become a fall-through, although we
7394     // may decide in the future to call TTI for all branches.
7395   }
7396   case Instruction::PHI: {
7397     auto *Phi = cast<PHINode>(I);
7398 
7399     // First-order recurrences are replaced by vector shuffles inside the loop.
7400     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7401     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7402       return TTI.getShuffleCost(
7403           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7404           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7405 
7406     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7407     // converted into select instructions. We require N - 1 selects per phi
7408     // node, where N is the number of incoming values.
7409     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7410       return (Phi->getNumIncomingValues() - 1) *
7411              TTI.getCmpSelInstrCost(
7412                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7413                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7414                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7415 
7416     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7417   }
7418   case Instruction::UDiv:
7419   case Instruction::SDiv:
7420   case Instruction::URem:
7421   case Instruction::SRem:
7422     // If we have a predicated instruction, it may not be executed for each
7423     // vector lane. Get the scalarization cost and scale this amount by the
7424     // probability of executing the predicated block. If the instruction is not
7425     // predicated, we fall through to the next case.
7426     if (VF.isVector() && isScalarWithPredication(I)) {
7427       InstructionCost Cost = 0;
7428 
7429       // These instructions have a non-void type, so account for the phi nodes
7430       // that we will create. This cost is likely to be zero. The phi node
7431       // cost, if any, should be scaled by the block probability because it
7432       // models a copy at the end of each predicated block.
7433       Cost += VF.getKnownMinValue() *
7434               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7435 
7436       // The cost of the non-predicated instruction.
7437       Cost += VF.getKnownMinValue() *
7438               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7439 
7440       // The cost of insertelement and extractelement instructions needed for
7441       // scalarization.
7442       Cost += getScalarizationOverhead(I, VF);
7443 
7444       // Scale the cost by the probability of executing the predicated blocks.
7445       // This assumes the predicated block for each vector lane is equally
7446       // likely.
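      // For example, a reciprocal block probability of 2 (i.e. a 50% chance
      // that a lane's predicated block executes) halves the accumulated cost.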
7447       return Cost / getReciprocalPredBlockProb();
7448     }
7449     LLVM_FALLTHROUGH;
7450   case Instruction::Add:
7451   case Instruction::FAdd:
7452   case Instruction::Sub:
7453   case Instruction::FSub:
7454   case Instruction::Mul:
7455   case Instruction::FMul:
7456   case Instruction::FDiv:
7457   case Instruction::FRem:
7458   case Instruction::Shl:
7459   case Instruction::LShr:
7460   case Instruction::AShr:
7461   case Instruction::And:
7462   case Instruction::Or:
7463   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go away.
7465     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7466       return 0;
7467 
7468     // Detect reduction patterns
7469     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7470       return *RedCost;
7471 
7472     // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7474     Value *Op2 = I->getOperand(1);
7475     TargetTransformInfo::OperandValueProperties Op2VP;
7476     TargetTransformInfo::OperandValueKind Op2VK =
7477         TTI.getOperandInfo(Op2, Op2VP);
7478     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7479       Op2VK = TargetTransformInfo::OK_UniformValue;
7480 
7481     SmallVector<const Value *, 4> Operands(I->operand_values());
7482     return TTI.getArithmeticInstrCost(
7483         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7484         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7485   }
7486   case Instruction::FNeg: {
7487     return TTI.getArithmeticInstrCost(
7488         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7489         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7490         TargetTransformInfo::OP_None, I->getOperand(0), I);
7491   }
7492   case Instruction::Select: {
7493     SelectInst *SI = cast<SelectInst>(I);
7494     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7495     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7496 
7497     const Value *Op0, *Op1;
7498     using namespace llvm::PatternMatch;
7499     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7500                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7501       // select x, y, false --> x & y
7502       // select x, true, y --> x | y
7503       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7504       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7505       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7506       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7507       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7508               Op1->getType()->getScalarSizeInBits() == 1);
7509 
7510       SmallVector<const Value *, 2> Operands{Op0, Op1};
7511       return TTI.getArithmeticInstrCost(
7512           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7513           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7514     }
7515 
7516     Type *CondTy = SI->getCondition()->getType();
7517     if (!ScalarCond)
7518       CondTy = VectorType::get(CondTy, VF);
7519 
7520     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7521     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7522       Pred = Cmp->getPredicate();
7523     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7524                                   CostKind, I);
7525   }
7526   case Instruction::ICmp:
7527   case Instruction::FCmp: {
7528     Type *ValTy = I->getOperand(0)->getType();
7529     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7530     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7531       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7532     VectorTy = ToVectorTy(ValTy, VF);
7533     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7534                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7535                                   I);
7536   }
7537   case Instruction::Store:
7538   case Instruction::Load: {
7539     ElementCount Width = VF;
7540     if (Width.isVector()) {
7541       InstWidening Decision = getWideningDecision(I, Width);
7542       assert(Decision != CM_Unknown &&
7543              "CM decision should be taken at this point");
7544       if (Decision == CM_Scalarize)
7545         Width = ElementCount::getFixed(1);
7546     }
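    // VectorTy reflects the widening decision: it degenerates to the scalar
    // type if the access will be scalarized, while the memory cost itself is
    // still queried with the original VF.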
7547     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7548     return getMemoryInstructionCost(I, VF);
7549   }
7550   case Instruction::BitCast:
7551     if (I->getType()->isPointerTy())
7552       return 0;
7553     LLVM_FALLTHROUGH;
7554   case Instruction::ZExt:
7555   case Instruction::SExt:
7556   case Instruction::FPToUI:
7557   case Instruction::FPToSI:
7558   case Instruction::FPExt:
7559   case Instruction::PtrToInt:
7560   case Instruction::IntToPtr:
7561   case Instruction::SIToFP:
7562   case Instruction::UIToFP:
7563   case Instruction::Trunc:
7564   case Instruction::FPTrunc: {
7565     // Computes the CastContextHint from a Load/Store instruction.
7566     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7567       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7568              "Expected a load or a store!");
7569 
7570       if (VF.isScalar() || !TheLoop->contains(I))
7571         return TTI::CastContextHint::Normal;
7572 
7573       switch (getWideningDecision(I, VF)) {
7574       case LoopVectorizationCostModel::CM_GatherScatter:
7575         return TTI::CastContextHint::GatherScatter;
7576       case LoopVectorizationCostModel::CM_Interleave:
7577         return TTI::CastContextHint::Interleave;
7578       case LoopVectorizationCostModel::CM_Scalarize:
7579       case LoopVectorizationCostModel::CM_Widen:
7580         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7581                                         : TTI::CastContextHint::Normal;
7582       case LoopVectorizationCostModel::CM_Widen_Reverse:
7583         return TTI::CastContextHint::Reversed;
7584       case LoopVectorizationCostModel::CM_Unknown:
7585         llvm_unreachable("Instr did not go through cost modelling?");
7586       }
7587 
7588       llvm_unreachable("Unhandled case!");
7589     };
7590 
7591     unsigned Opcode = I->getOpcode();
7592     TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc/FPTrunc, the context is the instruction's single user,
    // provided that user is a StoreInst.
7594     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7595       if (I->hasOneUse())
7596         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7597           CCH = ComputeCCH(Store);
7598     }
    // For ZExt/SExt/FPExt, the context is the operand, provided it is a
    // LoadInst.
7600     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7601              Opcode == Instruction::FPExt) {
7602       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7603         CCH = ComputeCCH(Load);
7604     }
7605 
7606     // We optimize the truncation of induction variables having constant
7607     // integer steps. The cost of these truncations is the same as the scalar
7608     // operation.
7609     if (isOptimizableIVTruncate(I, VF)) {
7610       auto *Trunc = cast<TruncInst>(I);
7611       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7612                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7613     }
7614 
7615     // Detect reduction patterns
7616     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7617       return *RedCost;
7618 
7619     Type *SrcScalarTy = I->getOperand(0)->getType();
7620     Type *SrcVecTy =
7621         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7622     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast entirely or
      // turn it into a slightly different cast. For example, if MinBW == 16,
7625       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7626       //
7627       // Calculate the modified src and dest types.
7628       Type *MinVecTy = VectorTy;
7629       if (Opcode == Instruction::Trunc) {
7630         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7631         VectorTy =
7632             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7633       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7634         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7635         VectorTy =
7636             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7637       }
7638     }
7639 
7640     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7641   }
7642   case Instruction::Call: {
7643     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7644       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7645         return *RedCost;
7646     bool NeedToScalarize;
7647     CallInst *CI = cast<CallInst>(I);
7648     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
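    // If the call can also be lowered to a vector intrinsic, return the
    // cheaper of the vectorized call and the intrinsic.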
7649     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7650       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7651       return std::min(CallCost, IntrinsicCost);
7652     }
7653     return CallCost;
7654   }
7655   case Instruction::ExtractValue:
7656     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7657   case Instruction::Alloca:
    // We cannot easily widen an alloca to a scalable alloca, as
7659     // the result would need to be a vector of pointers.
7660     if (VF.isScalable())
7661       return InstructionCost::getInvalid();
7662     LLVM_FALLTHROUGH;
7663   default:
7664     // This opcode is unknown. Assume that it is the same as 'mul'.
7665     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7666   } // end of switch.
7667 }
7668 
7669 char LoopVectorize::ID = 0;
7670 
7671 static const char lv_name[] = "Loop Vectorization";
7672 
7673 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7674 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7675 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7676 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7677 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7678 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7679 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7680 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7681 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7682 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7683 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7684 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7685 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7686 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7687 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7688 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7689 
7690 namespace llvm {
7691 
7692 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7693 
7694 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7695                               bool VectorizeOnlyWhenForced) {
7696   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7697 }
7698 
7699 } // end namespace llvm
7700 
7701 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7702   // Check if the pointer operand of a load or store instruction is
7703   // consecutive.
7704   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7705     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7706   return false;
7707 }
7708 
7709 void LoopVectorizationCostModel::collectValuesToIgnore() {
7710   // Ignore ephemeral values.
7711   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7712 
7713   // Ignore type-promoting instructions we identified during reduction
7714   // detection.
7715   for (auto &Reduction : Legal->getReductionVars()) {
7716     const RecurrenceDescriptor &RedDes = Reduction.second;
7717     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7718     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7719   }
7720   // Ignore type-casting instructions we identified during induction
7721   // detection.
7722   for (auto &Induction : Legal->getInductionVars()) {
7723     const InductionDescriptor &IndDes = Induction.second;
7724     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7725     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7726   }
7727 }
7728 
7729 void LoopVectorizationCostModel::collectInLoopReductions() {
7730   for (auto &Reduction : Legal->getReductionVars()) {
7731     PHINode *Phi = Reduction.first;
7732     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7733 
7734     // We don't collect reductions that are type promoted (yet).
7735     if (RdxDesc.getRecurrenceType() != Phi->getType())
7736       continue;
7737 
7738     // If the target would prefer this reduction to happen "in-loop", then we
7739     // want to record it as such.
7740     unsigned Opcode = RdxDesc.getOpcode();
7741     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7742         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7743                                    TargetTransformInfo::ReductionFlags()))
7744       continue;
7745 
7746     // Check that we can correctly put the reductions into the loop, by
7747     // finding the chain of operations that leads from the phi to the loop
7748     // exit value.
7749     SmallVector<Instruction *, 4> ReductionOperations =
7750         RdxDesc.getReductionOpChain(Phi, TheLoop);
7751     bool InLoop = !ReductionOperations.empty();
7752     if (InLoop) {
7753       InLoopReductionChains[Phi] = ReductionOperations;
7754       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7755       Instruction *LastChain = Phi;
7756       for (auto *I : ReductionOperations) {
7757         InLoopReductionImmediateChains[I] = LastChain;
7758         LastChain = I;
7759       }
7760     }
7761     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7762                       << " reduction for phi: " << *Phi << "\n");
7763   }
7764 }
7765 
// TODO: we could return a pair of values that specify the min VF and
// max VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do this yet because VPlan does not have
// a cost model that can choose which plan to execute when more than one is
// generated.
7771 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7772                                  LoopVectorizationCostModel &CM) {
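  // Derive a VF by dividing the given vector register width by the widest
  // scalar type used in the loop.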
7773   unsigned WidestType;
7774   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7775   return WidestVectorRegBits / WidestType;
7776 }
7777 
7778 VectorizationFactor
7779 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7780   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7781   ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
7784   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7785   // the vectorization pipeline.
7786   if (!OrigLoop->isInnermost()) {
7787     // If the user doesn't provide a vectorization factor, determine a
7788     // reasonable one.
7789     if (UserVF.isZero()) {
7790       VF = ElementCount::getFixed(determineVPlanVF(
7791           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7792               .getFixedSize(),
7793           CM));
7794       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7795 
7796       // Make sure we have a VF > 1 for stress testing.
7797       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7798         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7799                           << "overriding computed VF.\n");
7800         VF = ElementCount::getFixed(4);
7801       }
7802     }
7803     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7804     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7805            "VF needs to be a power of two");
7806     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7807                       << "VF " << VF << " to build VPlans.\n");
7808     buildVPlans(VF, VF);
7809 
7810     // For VPlan build stress testing, we bail out after VPlan construction.
7811     if (VPlanBuildStressTest)
7812       return VectorizationFactor::Disabled();
7813 
7814     return {VF, 0 /*Cost*/};
7815   }
7816 
7817   LLVM_DEBUG(
7818       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7819                 "VPlan-native path.\n");
7820   return VectorizationFactor::Disabled();
7821 }
7822 
7823 Optional<VectorizationFactor>
7824 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7825   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7826   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7828     return None;
7829 
7830   // Invalidate interleave groups if all blocks of loop will be predicated.
7831   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7832       !useMaskedInterleavedAccesses(*TTI)) {
7833     LLVM_DEBUG(
7834         dbgs()
7835         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7836            "which requires masked-interleaved support.\n");
7837     if (CM.InterleaveInfo.invalidateGroups())
7838       // Invalidating interleave groups also requires invalidating all decisions
7839       // based on them, which includes widening decisions and uniform and scalar
7840       // values.
7841       CM.invalidateCostModelingDecisions();
7842   }
7843 
7844   ElementCount MaxUserVF =
7845       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7846   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7847   if (!UserVF.isZero() && UserVFIsLegal) {
7848     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7849            "VF needs to be a power of two");
7850     // Collect the instructions (and their associated costs) that will be more
7851     // profitable to scalarize.
7852     if (CM.selectUserVectorizationFactor(UserVF)) {
7853       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7854       CM.collectInLoopReductions();
7855       buildVPlansWithVPRecipes(UserVF, UserVF);
7856       LLVM_DEBUG(printPlans(dbgs()));
7857       return {{UserVF, 0}};
7858     } else
7859       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7860                               "InvalidCost", ORE, OrigLoop);
7861   }
7862 
7863   // Populate the set of Vectorization Factor Candidates.
7864   ElementCountSet VFCandidates;
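  // Candidates are the powers of two from 1 up to the maximum feasible fixed
  // and scalable factors.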
7865   for (auto VF = ElementCount::getFixed(1);
7866        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7867     VFCandidates.insert(VF);
7868   for (auto VF = ElementCount::getScalable(1);
7869        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7870     VFCandidates.insert(VF);
7871 
7872   for (const auto &VF : VFCandidates) {
7873     // Collect Uniform and Scalar instructions after vectorization with VF.
7874     CM.collectUniformsAndScalars(VF);
7875 
7876     // Collect the instructions (and their associated costs) that will be more
7877     // profitable to scalarize.
7878     if (VF.isVector())
7879       CM.collectInstsToScalarize(VF);
7880   }
7881 
7882   CM.collectInLoopReductions();
7883   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7884   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7885 
7886   LLVM_DEBUG(printPlans(dbgs()));
7887   if (!MaxFactors.hasVector())
7888     return VectorizationFactor::Disabled();
7889 
7890   // Select the optimal vectorization factor.
7891   auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
7892 
7893   // Check if it is profitable to vectorize with runtime checks.
7894   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
7895   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
7896     bool PragmaThresholdReached =
7897         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
7898     bool ThresholdReached =
7899         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
7900     if ((ThresholdReached && !Hints.allowReordering()) ||
7901         PragmaThresholdReached) {
7902       ORE->emit([&]() {
7903         return OptimizationRemarkAnalysisAliasing(
7904                    DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
7905                    OrigLoop->getHeader())
7906                << "loop not vectorized: cannot prove it is safe to reorder "
7907                   "memory operations";
7908       });
7909       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
7910       Hints.emitRemarkWithHints();
7911       return VectorizationFactor::Disabled();
7912     }
7913   }
7914   return SelectedVF;
7915 }
7916 
7917 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7918   assert(count_if(VPlans,
7919                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7920              1 &&
         "Best VF does not have a single VPlan.");
7922 
7923   for (const VPlanPtr &Plan : VPlans) {
7924     if (Plan->hasVF(VF))
7925       return *Plan.get();
7926   }
7927   llvm_unreachable("No plan found!");
7928 }
7929 
7930 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7931                                            VPlan &BestVPlan,
7932                                            InnerLoopVectorizer &ILV,
7933                                            DominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
                    << ", UF=" << BestUF << '\n');
7936 
7937   // Perform the actual loop transformation.
7938 
7939   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7940   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7941   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7942   State.TripCount = ILV.getOrCreateTripCount(nullptr);
7943   State.CanonicalIV = ILV.Induction;
7944   ILV.collectPoisonGeneratingRecipes(State);
7945 
7946   ILV.printDebugTracesAtStart();
7947 
7948   //===------------------------------------------------===//
7949   //
  // Notice: any optimization or new instruction that goes
  // into the code below should also be implemented in
  // the cost model.
7953   //
7954   //===------------------------------------------------===//
7955 
7956   // 2. Copy and widen instructions from the old loop into the new loop.
7957   BestVPlan.execute(&State);
7958 
7959   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7960   //    predication, updating analyses.
7961   ILV.fixVectorizedLoop(State);
7962 
7963   ILV.printDebugTracesAtEnd();
7964 }
7965 
7966 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7967 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7968   for (const auto &Plan : VPlans)
7969     if (PrintVPlansInDotFormat)
7970       Plan->printDOT(O);
7971     else
7972       Plan->print(O);
7973 }
7974 #endif
7975 
7976 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7977     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7978 
  // We create new control flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by
  // their terminators.
7982   SmallVector<BasicBlock*> ExitingBlocks;
7983   OrigLoop->getExitingBlocks(ExitingBlocks);
7984   for (auto *BB : ExitingBlocks) {
7985     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7986     if (!Cmp || !Cmp->hasOneUse())
7987       continue;
7988 
7989     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7990     if (!DeadInstructions.insert(Cmp).second)
7991       continue;
7992 
    // The operands of the icmp are often dead truncs, used by IndUpdate.
7994     // TODO: can recurse through operands in general
7995     for (Value *Op : Cmp->operands()) {
7996       if (isa<TruncInst>(Op) && Op->hasOneUse())
7997           DeadInstructions.insert(cast<Instruction>(Op));
7998     }
7999   }
8000 
8001   // We create new "steps" for induction variable updates to which the original
8002   // induction variables map. An original update instruction will be dead if
8003   // all its users except the induction variable are dead.
8004   auto *Latch = OrigLoop->getLoopLatch();
8005   for (auto &Induction : Legal->getInductionVars()) {
8006     PHINode *Ind = Induction.first;
8007     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8008 
8009     // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
8011     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
8012       continue;
8013 
8014     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
8015           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
8016         }))
8017       DeadInstructions.insert(IndUpdate);
8018   }
8019 }
8020 
8021 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
8022 
8023 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
8024 
8025 Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx,
8026                                         Value *Step,
8027                                         Instruction::BinaryOps BinOp) {
8028   // When unrolling and the VF is 1, we only need to add a simple scalar.
8029   Type *Ty = Val->getType();
8030   assert(!Ty->isVectorTy() && "Val must be a scalar");
8031 
8032   if (Ty->isFloatingPointTy()) {
8033     // Floating-point operations inherit FMF via the builder's flags.
8034     Value *MulOp = Builder.CreateFMul(StartIdx, Step);
8035     return Builder.CreateBinOp(BinOp, Val, MulOp);
8036   }
8037   return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction");
8038 }
8039 
8040 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
8041   SmallVector<Metadata *, 4> MDs;
8042   // Reserve first location for self reference to the LoopID metadata node.
8043   MDs.push_back(nullptr);
8044   bool IsUnrollMetadata = false;
8045   MDNode *LoopID = L->getLoopID();
8046   if (LoopID) {
8047     // First find existing loop unrolling disable metadata.
8048     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
8049       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
8050       if (MD) {
8051         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
8052         IsUnrollMetadata =
8053             S && S->getString().startswith("llvm.loop.unroll.disable");
8054       }
8055       MDs.push_back(LoopID->getOperand(i));
8056     }
8057   }
8058 
8059   if (!IsUnrollMetadata) {
8060     // Add runtime unroll disable metadata.
8061     LLVMContext &Context = L->getHeader()->getContext();
8062     SmallVector<Metadata *, 1> DisableOperands;
8063     DisableOperands.push_back(
8064         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
8065     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
8066     MDs.push_back(DisableNode);
8067     MDNode *NewLoopID = MDNode::get(Context, MDs);
8068     // Set operand 0 to refer to the loop id itself.
8069     NewLoopID->replaceOperandWith(0, NewLoopID);
8070     L->setLoopID(NewLoopID);
8071   }
8072 }
8073 
8074 //===--------------------------------------------------------------------===//
8075 // EpilogueVectorizerMainLoop
8076 //===--------------------------------------------------------------------===//
8077 
8078 /// This function is partially responsible for generating the control flow
8079 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8080 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
8081   MDNode *OrigLoopID = OrigLoop->getLoopID();
8082   Loop *Lp = createVectorLoopSkeleton("");
8083 
8084   // Generate the code to check the minimum iteration count of the vector
8085   // epilogue (see below).
8086   EPI.EpilogueIterationCountCheck =
8087       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
8088   EPI.EpilogueIterationCountCheck->setName("iter.check");
8089 
8090   // Generate the code to check any assumptions that we've made for SCEV
8091   // expressions.
8092   EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
8093 
8094   // Generate the code that checks at runtime if arrays overlap. We put the
8095   // checks into a separate block to make the more common case of few elements
8096   // faster.
8097   EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
8098 
8099   // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path length is shorter for the case
  // that goes directly through the vector epilogue. The longer path length for
  // the main loop is compensated for by the gain from vectorizing the larger
8103   // trip count. Note: the branch will get updated later on when we vectorize
8104   // the epilogue.
8105   EPI.MainLoopIterationCountCheck =
8106       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
8107 
8108   // Generate the induction variable.
8109   OldInduction = Legal->getPrimaryInduction();
8110   Type *IdxTy = Legal->getWidestInductionType();
8111   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8112 
8113   IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt());
8114   Value *Step = getRuntimeVF(B, IdxTy, VF * UF);
8115   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8116   EPI.VectorTripCount = CountRoundDown;
8117   Induction =
8118       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8119                               getDebugLocFromInstOrOperands(OldInduction));
8120 
8121   // Skip induction resume value creation here because they will be created in
8122   // the second pass. If we created them here, they wouldn't be used anyway,
8123   // because the vplan in the second pass still contains the inductions from the
8124   // original loop.
8125 
8126   return completeLoopSkeleton(Lp, OrigLoopID);
8127 }
8128 
8129 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
8130   LLVM_DEBUG({
8131     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
8132            << "Main Loop VF:" << EPI.MainLoopVF
8133            << ", Main Loop UF:" << EPI.MainLoopUF
8134            << ", Epilogue Loop VF:" << EPI.EpilogueVF
8135            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8136   });
8137 }
8138 
8139 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
8140   DEBUG_WITH_TYPE(VerboseDebug, {
8141     dbgs() << "intermediate fn:\n"
8142            << *OrigLoop->getHeader()->getParent() << "\n";
8143   });
8144 }
8145 
8146 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8147     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
8148   assert(L && "Expected valid Loop.");
8149   assert(Bypass && "Expected valid bypass basic block.");
8150   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
8151   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8152   Value *Count = getOrCreateTripCount(L);
8153   // Reuse existing vector loop preheader for TC checks.
8154   // Note that new preheader block is generated for vector loop.
8155   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8156   IRBuilder<> Builder(TCCheckBlock->getTerminator());
8157 
8158   // Generate code to check if the loop's trip count is less than VF * UF of the
8159   // main vector loop.
8160   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
8161       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8162 
8163   Value *CheckMinIters = Builder.CreateICmp(
8164       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
8165       "min.iters.check");
8166 
8167   if (!ForEpilogue)
8168     TCCheckBlock->setName("vector.main.loop.iter.check");
8169 
8170   // Create new preheader for vector loop.
8171   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8172                                    DT, LI, nullptr, "vector.ph");
8173 
8174   if (ForEpilogue) {
8175     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8176                                  DT->getNode(Bypass)->getIDom()) &&
8177            "TC check is expected to dominate Bypass");
8178 
8179     // Update dominator for Bypass & LoopExit.
8180     DT->changeImmediateDominator(Bypass, TCCheckBlock);
8181     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8182       // For loops with multiple exits, there's no edge from the middle block
8183       // to exit blocks (as the epilogue must run) and thus no need to update
8184       // the immediate dominator of the exit blocks.
8185       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8186 
8187     LoopBypassBlocks.push_back(TCCheckBlock);
8188 
8189     // Save the trip count so we don't have to regenerate it in the
8190     // vec.epilog.iter.check. This is safe to do because the trip count
8191     // generated here dominates the vector epilog iter check.
8192     EPI.TripCount = Count;
8193   }
8194 
8195   ReplaceInstWithInst(
8196       TCCheckBlock->getTerminator(),
8197       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8198 
8199   return TCCheckBlock;
8200 }
8201 
8202 //===--------------------------------------------------------------------===//
8203 // EpilogueVectorizerEpilogueLoop
8204 //===--------------------------------------------------------------------===//
8205 
8206 /// This function is partially responsible for generating the control flow
8207 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8208 BasicBlock *
8209 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8210   MDNode *OrigLoopID = OrigLoop->getLoopID();
8211   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
8212 
  // Now, compare the remaining count and, if there aren't enough iterations to
  // execute the vectorized epilogue, skip to the scalar part.
8215   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
8216   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
8217   LoopVectorPreHeader =
8218       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
8219                  LI, nullptr, "vec.epilog.ph");
8220   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
8221                                           VecEpilogueIterationCountCheck);
8222 
8223   // Adjust the control flow taking the state info from the main loop
8224   // vectorization into account.
8225   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8226          "expected this to be saved from the previous pass.");
8227   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8228       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8229 
8230   DT->changeImmediateDominator(LoopVectorPreHeader,
8231                                EPI.MainLoopIterationCountCheck);
8232 
8233   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8234       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8235 
8236   if (EPI.SCEVSafetyCheck)
8237     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8238         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8239   if (EPI.MemSafetyCheck)
8240     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8241         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8242 
8243   DT->changeImmediateDominator(
8244       VecEpilogueIterationCountCheck,
8245       VecEpilogueIterationCountCheck->getSinglePredecessor());
8246 
8247   DT->changeImmediateDominator(LoopScalarPreHeader,
8248                                EPI.EpilogueIterationCountCheck);
8249   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8250     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
8252     // dominator of the exit blocks.
8253     DT->changeImmediateDominator(LoopExitBlock,
8254                                  EPI.EpilogueIterationCountCheck);
8255 
8256   // Keep track of bypass blocks, as they feed start values to the induction
8257   // phis in the scalar loop preheader.
8258   if (EPI.SCEVSafetyCheck)
8259     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8260   if (EPI.MemSafetyCheck)
8261     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8262   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8263 
8264   // Generate a resume induction for the vector epilogue and put it in the
8265   // vector epilogue preheader
8266   Type *IdxTy = Legal->getWidestInductionType();
8267   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8268                                          LoopVectorPreHeader->getFirstNonPHI());
8269   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8270   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8271                            EPI.MainLoopIterationCountCheck);
8272 
8273   // Generate the induction variable.
8274   OldInduction = Legal->getPrimaryInduction();
8275   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8276   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8277   Value *StartIdx = EPResumeVal;
8278   Induction =
8279       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8280                               getDebugLocFromInstOrOperands(OldInduction));
8281 
8282   // Generate induction resume values. These variables save the new starting
8283   // indexes for the scalar loop. They are used to test if there are any tail
8284   // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to the iteration
  // count check, the resume value for the induction variable comes from
8287   // the trip count of the main vector loop, hence passing the AdditionalBypass
8288   // argument.
8289   createInductionResumeValues(Lp, CountRoundDown,
8290                               {VecEpilogueIterationCountCheck,
8291                                EPI.VectorTripCount} /* AdditionalBypass */);
8292 
8293   AddRuntimeUnrollDisableMetaData(Lp);
8294   return completeLoopSkeleton(Lp, OrigLoopID);
8295 }
8296 
8297 BasicBlock *
8298 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8299     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8300 
8301   assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
8303   assert(
8304       (!isa<Instruction>(EPI.TripCount) ||
8305        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8306       "saved trip count does not dominate insertion point.");
8307   Value *TC = EPI.TripCount;
8308   IRBuilder<> Builder(Insert->getTerminator());
8309   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8310 
8311   // Generate code to check if the loop's trip count is less than VF * UF of the
8312   // vector epilogue loop.
8313   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8314       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8315 
8316   Value *CheckMinIters =
8317       Builder.CreateICmp(P, Count,
8318                          createStepForVF(Builder, Count->getType(),
8319                                          EPI.EpilogueVF, EPI.EpilogueUF),
8320                          "min.epilog.iters.check");
8321 
8322   ReplaceInstWithInst(
8323       Insert->getTerminator(),
8324       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8325 
8326   LoopBypassBlocks.push_back(Insert);
8327   return Insert;
8328 }
8329 
8330 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8331   LLVM_DEBUG({
8332     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8333            << "Epilogue Loop VF:" << EPI.EpilogueVF
8334            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8335   });
8336 }
8337 
8338 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8339   DEBUG_WITH_TYPE(VerboseDebug, {
8340     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8341   });
8342 }
8343 
8344 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8345     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8346   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8347   bool PredicateAtRangeStart = Predicate(Range.Start);
8348 
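  // Clamp Range.End down to the first VF whose answer differs from the answer
  // at Range.Start, so that every VF remaining in the range shares the same
  // decision. For example, for Range = [4, 32) and a predicate that holds
  // only up to VF 8, the range is clamped to [4, 16).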
8349   for (ElementCount TmpVF = Range.Start * 2;
8350        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8351     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8352       Range.End = TmpVF;
8353       break;
8354     }
8355 
8356   return PredicateAtRangeStart;
8357 }
8358 
8359 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8360 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8361 /// of VF's starting at a given VF and extending it as much as possible. Each
8362 /// vectorization decision can potentially shorten this sub-range during
8363 /// buildVPlan().
8364 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8365                                            ElementCount MaxVF) {
8366   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8367   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8368     VFRange SubRange = {VF, MaxVFPlusOne};
8369     VPlans.push_back(buildVPlan(SubRange));
8370     VF = SubRange.End;
8371   }
8372 }
8373 
8374 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8375                                          VPlanPtr &Plan) {
8376   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8377 
8378   // Look for cached value.
8379   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8380   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8381   if (ECEntryIt != EdgeMaskCache.end())
8382     return ECEntryIt->second;
8383 
8384   VPValue *SrcMask = createBlockInMask(Src, Plan);
8385 
8386   // The terminator has to be a branch inst!
8387   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8388   assert(BI && "Unexpected terminator found");
8389 
8390   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8391     return EdgeMaskCache[Edge] = SrcMask;
8392 
8393   // If source is an exiting block, we know the exit edge is dynamically dead
8394   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8395   // adding uses of an otherwise potentially dead instruction.
8396   if (OrigLoop->isLoopExiting(Src))
8397     return EdgeMaskCache[Edge] = SrcMask;
8398 
8399   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8400   assert(EdgeMask && "No Edge Mask found for condition");
8401 
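       // If Dst is reached via the false successor, the edge is taken when the
       // branch condition is false, so negate the condition to form the edge mask.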
8402   if (BI->getSuccessor(0) != Dst)
8403     EdgeMask = Builder.createNot(EdgeMask);
8404 
8405   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8406     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8407     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8408     // The select version does not introduce new UB if SrcMask is false and
8409     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8410     VPValue *False = Plan->getOrAddVPValue(
8411         ConstantInt::getFalse(BI->getCondition()->getType()));
8412     EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
8413   }
8414 
8415   return EdgeMaskCache[Edge] = EdgeMask;
8416 }
8417 
8418 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8419   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8420 
8421   // Look for cached value.
8422   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8423   if (BCEntryIt != BlockMaskCache.end())
8424     return BCEntryIt->second;
8425 
8426   // All-one mask is modelled as no-mask following the convention for masked
8427   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8428   VPValue *BlockMask = nullptr;
8429 
8430   if (OrigLoop->getHeader() == BB) {
8431     if (!CM.blockNeedsPredicationForAnyReason(BB))
8432       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8433 
8434     // Create the block in mask as the first non-phi instruction in the block.
8435     VPBuilder::InsertPointGuard Guard(Builder);
8436     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
8437     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
8438 
8439     // Introduce the early-exit compare IV <= BTC to form header block mask.
8440     // This is used instead of IV < TC because TC may wrap, unlike BTC.
8441     // Start by constructing the desired canonical IV.
8442     VPValue *IV = nullptr;
8443     if (Legal->getPrimaryInduction())
8444       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
8445     else {
8446       auto *IVRecipe = new VPWidenCanonicalIVRecipe();
8447       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
8448       IV = IVRecipe;
8449     }
8450     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8451     bool TailFolded = !CM.isScalarEpilogueAllowed();
8452 
8453     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
8454       // Although ActiveLaneMask is a binary op that consumes the loop tripcount
8455       // as its second argument, we only pass the IV here; the tripcount is
8456       // extracted from the transform state when codegen of the VP instructions
8457       // happens.
8458       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
8459     } else {
8460       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8461     }
8462     return BlockMaskCache[BB] = BlockMask;
8463   }
8464 
8465   // This is the block mask. We OR all incoming edges.
8466   for (auto *Predecessor : predecessors(BB)) {
8467     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8468     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8469       return BlockMaskCache[BB] = EdgeMask;
8470 
8471     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8472       BlockMask = EdgeMask;
8473       continue;
8474     }
8475 
8476     BlockMask = Builder.createOr(BlockMask, EdgeMask);
8477   }
8478 
8479   return BlockMaskCache[BB] = BlockMask;
8480 }
8481 
8482 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8483                                                 ArrayRef<VPValue *> Operands,
8484                                                 VFRange &Range,
8485                                                 VPlanPtr &Plan) {
8486   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8487          "Must be called with either a load or store");
8488 
8489   auto willWiden = [&](ElementCount VF) -> bool {
8490     if (VF.isScalar())
8491       return false;
8492     LoopVectorizationCostModel::InstWidening Decision =
8493         CM.getWideningDecision(I, VF);
8494     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8495            "CM decision should be taken at this point.");
8496     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8497       return true;
8498     if (CM.isScalarAfterVectorization(I, VF) ||
8499         CM.isProfitableToScalarize(I, VF))
8500       return false;
8501     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8502   };
8503 
8504   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8505     return nullptr;
8506 
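       // If the access has to be predicated (e.g. it executes conditionally or the
       // tail is folded by masking), compute the mask of its enclosing block.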
8507   VPValue *Mask = nullptr;
8508   if (Legal->isMaskRequired(I))
8509     Mask = createBlockInMask(I->getParent(), Plan);
8510 
8511   // Determine if the pointer operand of the access is either consecutive or
8512   // reverse consecutive.
8513   LoopVectorizationCostModel::InstWidening Decision =
8514       CM.getWideningDecision(I, Range.Start);
8515   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8516   bool Consecutive =
8517       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8518 
8519   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8520     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8521                                               Consecutive, Reverse);
8522 
8523   StoreInst *Store = cast<StoreInst>(I);
8524   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8525                                             Mask, Consecutive, Reverse);
8526 }
8527 
8528 VPWidenIntOrFpInductionRecipe *
8529 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
8530                                            ArrayRef<VPValue *> Operands) const {
8531   // Check if this is an integer or fp induction. If so, build the recipe that
8532   // produces its scalar and vector values.
8533   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi)) {
8534     assert(II->getStartValue() ==
8535            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8536     return new VPWidenIntOrFpInductionRecipe(Phi, Operands[0], *II);
8537   }
8538 
8539   return nullptr;
8540 }
8541 
8542 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8543     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8544     VPlan &Plan) const {
8545   // Optimize the special case where the source is a constant integer
8546   // induction variable. Notice that we can only optimize the 'trunc' case
8547   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8548   // (c) other casts depend on pointer size.
8549 
8550   // Determine whether \p K is a truncation based on an induction variable that
8551   // can be optimized.
8552   auto isOptimizableIVTruncate =
8553       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8554     return [=](ElementCount VF) -> bool {
8555       return CM.isOptimizableIVTruncate(K, VF);
8556     };
8557   };
8558 
8559   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8560           isOptimizableIVTruncate(I), Range)) {
8561 
8562     auto *Phi = cast<PHINode>(I->getOperand(0));
8563     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8564     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8565     return new VPWidenIntOrFpInductionRecipe(Phi, Start, II, I);
8566   }
8567   return nullptr;
8568 }
8569 
8570 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8571                                                 ArrayRef<VPValue *> Operands,
8572                                                 VPlanPtr &Plan) {
8573   // If all incoming values are equal, the incoming VPValue can be used directly
8574   // instead of creating a new VPBlendRecipe.
8575   VPValue *FirstIncoming = Operands[0];
8576   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8577         return FirstIncoming == Inc;
8578       })) {
8579     return Operands[0];
8580   }
8581 
8582   // We know that all PHIs in non-header blocks are converted into selects, so
8583   // we don't have to worry about the insertion order and we can just use the
8584   // builder. At this point we generate the predication tree. There may be
8585   // duplications since this is a simple recursive scan, but future
8586   // optimizations will clean it up.
8587   SmallVector<VPValue *, 2> OperandsWithMask;
8588   unsigned NumIncoming = Phi->getNumIncomingValues();
8589 
8590   for (unsigned In = 0; In < NumIncoming; In++) {
8591     VPValue *EdgeMask =
8592       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8593     assert((EdgeMask || NumIncoming == 1) &&
8594            "Multiple predecessors with one having a full mask");
8595     OperandsWithMask.push_back(Operands[In]);
8596     if (EdgeMask)
8597       OperandsWithMask.push_back(EdgeMask);
8598   }
8599   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8600 }
8601 
8602 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8603                                                    ArrayRef<VPValue *> Operands,
8604                                                    VFRange &Range) const {
8605 
8606   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8607       [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
8608       Range);
8609 
8610   if (IsPredicated)
8611     return nullptr;
8612 
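       // Do not create a widened call for these annotation-like intrinsics;
       // returning nullptr leaves them to the replication path instead.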
8613   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8614   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8615              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8616              ID == Intrinsic::pseudoprobe ||
8617              ID == Intrinsic::experimental_noalias_scope_decl))
8618     return nullptr;
8619 
8620   auto willWiden = [&](ElementCount VF) -> bool {
8621     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8622     // The following case may be scalarized depending on the VF.
8623     // The flag indicates whether we use an intrinsic or a regular call for the
8624     // vectorized version of the instruction.
8625     // Is it beneficial to perform the intrinsic call compared to the lib call?
8626     bool NeedToScalarize = false;
8627     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8628     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8629     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8630     return UseVectorIntrinsic || !NeedToScalarize;
8631   };
8632 
8633   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8634     return nullptr;
8635 
8636   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8637   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8638 }
8639 
8640 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8641   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8642          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8643   // The instruction should be widened, unless it is scalar after vectorization,
8644   // scalarization is profitable, or it is predicated.
8645   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8646     return CM.isScalarAfterVectorization(I, VF) ||
8647            CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
8648   };
8649   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8650                                                              Range);
8651 }
8652 
8653 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8654                                            ArrayRef<VPValue *> Operands) const {
8655   auto IsVectorizableOpcode = [](unsigned Opcode) {
8656     switch (Opcode) {
8657     case Instruction::Add:
8658     case Instruction::And:
8659     case Instruction::AShr:
8660     case Instruction::BitCast:
8661     case Instruction::FAdd:
8662     case Instruction::FCmp:
8663     case Instruction::FDiv:
8664     case Instruction::FMul:
8665     case Instruction::FNeg:
8666     case Instruction::FPExt:
8667     case Instruction::FPToSI:
8668     case Instruction::FPToUI:
8669     case Instruction::FPTrunc:
8670     case Instruction::FRem:
8671     case Instruction::FSub:
8672     case Instruction::ICmp:
8673     case Instruction::IntToPtr:
8674     case Instruction::LShr:
8675     case Instruction::Mul:
8676     case Instruction::Or:
8677     case Instruction::PtrToInt:
8678     case Instruction::SDiv:
8679     case Instruction::Select:
8680     case Instruction::SExt:
8681     case Instruction::Shl:
8682     case Instruction::SIToFP:
8683     case Instruction::SRem:
8684     case Instruction::Sub:
8685     case Instruction::Trunc:
8686     case Instruction::UDiv:
8687     case Instruction::UIToFP:
8688     case Instruction::URem:
8689     case Instruction::Xor:
8690     case Instruction::ZExt:
8691       return true;
8692     }
8693     return false;
8694   };
8695 
8696   if (!IsVectorizableOpcode(I->getOpcode()))
8697     return nullptr;
8698 
8699   // Success: widen this instruction.
8700   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8701 }
8702 
8703 void VPRecipeBuilder::fixHeaderPhis() {
8704   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
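       // For each recorded header phi recipe, add the recipe defining its value
       // incoming from the loop latch as an extra (backedge) operand.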
8705   for (VPWidenPHIRecipe *R : PhisToFix) {
8706     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8707     VPRecipeBase *IncR =
8708         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8709     R->addOperand(IncR->getVPSingleValue());
8710   }
8711 }
8712 
8713 VPBasicBlock *VPRecipeBuilder::handleReplication(
8714     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8715     VPlanPtr &Plan) {
8716   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8717       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8718       Range);
8719 
8720   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8721       [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
8722       Range);
8723 
8724   // Even if the instruction is not marked as uniform, there are certain
8725   // intrinsic calls that can be effectively treated as such, so we check for
8726   // them here. Conservatively, we only do this for scalable vectors, since
8727   // for fixed-width VFs we can always fall back on full scalarization.
8728   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8729     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8730     case Intrinsic::assume:
8731     case Intrinsic::lifetime_start:
8732     case Intrinsic::lifetime_end:
8733       // For scalable vectors if one of the operands is variant then we still
8734       // want to mark as uniform, which will generate one instruction for just
8735       // the first lane of the vector. We can't scalarize the call in the same
8736       // way as for fixed-width vectors because we don't know how many lanes
8737       // there are.
8738       //
8739       // The reasons for doing it this way for scalable vectors are:
8740       //   1. For the assume intrinsic generating the instruction for the first
8741       //      lane is still better than not generating any at all. For
8742       //      example, the input may be a splat across all lanes.
8743       //   2. For the lifetime start/end intrinsics the pointer operand only
8744       //      does anything useful when the input comes from a stack object,
8745       //      which suggests it should always be uniform. For non-stack objects
8746       //      the effect is to poison the object, which still allows us to
8747       //      remove the call.
8748       IsUniform = true;
8749       break;
8750     default:
8751       break;
8752     }
8753   }
8754 
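       // Build a replicating recipe for I: it generates scalar copies of I for each
       // lane and unroll part, or only the first lane per part when uniform.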
8755   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8756                                        IsUniform, IsPredicated);
8757   setRecipe(I, Recipe);
8758   Plan->addVPValue(I, Recipe);
8759 
8760   // Find if I uses a predicated instruction. If so, it will use its scalar
8761   // value. Avoid hoisting the insert-element which packs the scalar value into
8762   // a vector value, as that happens iff all users use the vector value.
8763   for (VPValue *Op : Recipe->operands()) {
8764     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8765     if (!PredR)
8766       continue;
8767     auto *RepR =
8768         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8769     assert(RepR->isPredicated() &&
8770            "expected Replicate recipe to be predicated");
8771     RepR->setAlsoPack(false);
8772   }
8773 
8774   // Finalize the recipe for Instr, first if it is not predicated.
8775   if (!IsPredicated) {
8776     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8777     VPBB->appendRecipe(Recipe);
8778     return VPBB;
8779   }
8780   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8781   assert(VPBB->getSuccessors().empty() &&
8782          "VPBB has successors when handling predicated replication.");
8783   // Record predicated instructions for above packing optimizations.
8784   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8785   VPBlockUtils::insertBlockAfter(Region, VPBB);
8786   auto *RegSucc = new VPBasicBlock();
8787   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8788   return RegSucc;
8789 }
8790 
8791 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8792                                                       VPRecipeBase *PredRecipe,
8793                                                       VPlanPtr &Plan) {
8794   // Instructions marked for predication are replicated and placed under an
8795   // if-then construct to prevent side-effects.
8796 
8797   // Generate recipes to compute the block mask for this region.
8798   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8799 
8800   // Build the triangular if-then region.
8801   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8802   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8803   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8804   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8805   auto *PHIRecipe = Instr->getType()->isVoidTy()
8806                         ? nullptr
8807                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8808   if (PHIRecipe) {
8809     Plan->removeVPValueFor(Instr);
8810     Plan->addVPValue(Instr, PHIRecipe);
8811   }
8812   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8813   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8814   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8815 
8816   // Note: first set Entry as region entry and then connect successors starting
8817   // from it in order, to propagate the "parent" of each VPBasicBlock.
8818   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8819   VPBlockUtils::connectBlocks(Pred, Exit);
8820 
8821   return Region;
8822 }
8823 
8824 VPRecipeOrVPValueTy
8825 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8826                                         ArrayRef<VPValue *> Operands,
8827                                         VFRange &Range, VPlanPtr &Plan) {
8828   // First, check for specific widening recipes that deal with calls, memory
8829   // operations, inductions and Phi nodes.
8830   if (auto *CI = dyn_cast<CallInst>(Instr))
8831     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8832 
8833   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8834     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8835 
8836   VPRecipeBase *Recipe;
8837   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8838     if (Phi->getParent() != OrigLoop->getHeader())
8839       return tryToBlend(Phi, Operands, Plan);
8840     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
8841       return toVPRecipeResult(Recipe);
8842 
8843     VPWidenPHIRecipe *PhiRecipe = nullptr;
8844     if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
8845       VPValue *StartV = Operands[0];
8846       if (Legal->isReductionVariable(Phi)) {
8847         const RecurrenceDescriptor &RdxDesc =
8848             Legal->getReductionVars().find(Phi)->second;
8849         assert(RdxDesc.getRecurrenceStartValue() ==
8850                Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8851         PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8852                                              CM.isInLoopReduction(Phi),
8853                                              CM.useOrderedReductions(RdxDesc));
8854       } else {
8855         PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8856       }
8857 
8858       // Record the incoming value from the backedge, so we can add the incoming
8859       // value from the backedge after all recipes have been created.
8860       recordRecipeOf(cast<Instruction>(
8861           Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8862       PhisToFix.push_back(PhiRecipe);
8863     } else {
8864       // TODO: record start and backedge value for remaining pointer induction
8865       // phis.
8866       assert(Phi->getType()->isPointerTy() &&
8867              "only pointer phis should be handled here");
8868       PhiRecipe = new VPWidenPHIRecipe(Phi);
8869     }
8870 
8871     return toVPRecipeResult(PhiRecipe);
8872   }
8873 
8874   if (isa<TruncInst>(Instr) &&
8875       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8876                                                Range, *Plan)))
8877     return toVPRecipeResult(Recipe);
8878 
8879   if (!shouldWiden(Instr, Range))
8880     return nullptr;
8881 
8882   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8883     return toVPRecipeResult(new VPWidenGEPRecipe(
8884         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8885 
8886   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8887     bool InvariantCond =
8888         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8889     return toVPRecipeResult(new VPWidenSelectRecipe(
8890         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8891   }
8892 
8893   return toVPRecipeResult(tryToWiden(Instr, Operands));
8894 }
8895 
8896 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8897                                                         ElementCount MaxVF) {
8898   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8899 
8900   // Collect instructions from the original loop that will become trivially dead
8901   // in the vectorized loop. We don't need to vectorize these instructions. For
8902   // example, original induction update instructions can become dead because we
8903   // separately emit induction "steps" when generating code for the new loop.
8904   // Similarly, we create a new latch condition when setting up the structure
8905   // of the new loop, so the old one can become dead.
8906   SmallPtrSet<Instruction *, 4> DeadInstructions;
8907   collectTriviallyDeadInstructions(DeadInstructions);
8908 
8909   // Add assume instructions we need to drop to DeadInstructions, to prevent
8910   // them from being added to the VPlan.
8911   // TODO: We only need to drop assumes in blocks that get flattened. If the
8912   // control flow is preserved, we should keep them.
8913   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8914   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8915 
8916   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8917   // Dead instructions do not need sinking. Remove them from SinkAfter.
8918   for (Instruction *I : DeadInstructions)
8919     SinkAfter.erase(I);
8920 
8921   // Cannot sink instructions after dead instructions (there won't be any
8922   // recipes for them). Instead, find the first non-dead previous instruction.
8923   for (auto &P : Legal->getSinkAfter()) {
8924     Instruction *SinkTarget = P.second;
8925     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8926     (void)FirstInst;
8927     while (DeadInstructions.contains(SinkTarget)) {
8928       assert(
8929           SinkTarget != FirstInst &&
8930           "Must find a live instruction (at least the one feeding the "
8931           "first-order recurrence PHI) before reaching beginning of the block");
8932       SinkTarget = SinkTarget->getPrevNode();
8933       assert(SinkTarget != P.first &&
8934              "sink source equals target, no sinking required");
8935     }
8936     P.second = SinkTarget;
8937   }
8938 
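       // As in buildVPlans, build a VPlan for each clamped sub-range of VFs.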
8939   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8940   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8941     VFRange SubRange = {VF, MaxVFPlusOne};
8942     VPlans.push_back(
8943         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8944     VF = SubRange.End;
8945   }
8946 }
8947 
8948 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8949     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8950     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8951 
8952   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8953 
8954   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8955 
8956   // ---------------------------------------------------------------------------
8957   // Pre-construction: record ingredients whose recipes we'll need to further
8958   // process after constructing the initial VPlan.
8959   // ---------------------------------------------------------------------------
8960 
8961   // Mark instructions we'll need to sink later and their targets as
8962   // ingredients whose recipe we'll need to record.
8963   for (auto &Entry : SinkAfter) {
8964     RecipeBuilder.recordRecipeOf(Entry.first);
8965     RecipeBuilder.recordRecipeOf(Entry.second);
8966   }
8967   for (auto &Reduction : CM.getInLoopReductionChains()) {
8968     PHINode *Phi = Reduction.first;
8969     RecurKind Kind =
8970         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8971     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8972 
8973     RecipeBuilder.recordRecipeOf(Phi);
8974     for (auto &R : ReductionOperations) {
8975       RecipeBuilder.recordRecipeOf(R);
8976       // For min/max reductions, where we have a pair of icmp/select, we also
8977       // need to record the ICmp recipe, so it can be removed later.
8978       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8979              "Only min/max recurrences allowed for inloop reductions");
8980       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8981         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8982     }
8983   }
8984 
8985   // For each interleave group which is relevant for this (possibly trimmed)
8986   // Range, add it to the set of groups to be later applied to the VPlan and add
8987   // placeholders for its members' Recipes which we'll be replacing with a
8988   // single VPInterleaveRecipe.
8989   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8990     auto applyIG = [IG, this](ElementCount VF) -> bool {
8991       return (VF.isVector() && // Query is illegal for VF == 1
8992               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8993                   LoopVectorizationCostModel::CM_Interleave);
8994     };
8995     if (!getDecisionAndClampRange(applyIG, Range))
8996       continue;
8997     InterleaveGroups.insert(IG);
8998     for (unsigned i = 0; i < IG->getFactor(); i++)
8999       if (Instruction *Member = IG->getMember(i))
9000         RecipeBuilder.recordRecipeOf(Member);
9001   }
9002 
9003   // ---------------------------------------------------------------------------
9004   // Build initial VPlan: Scan the body of the loop in a topological order to
9005   // visit each basic block after having visited its predecessor basic blocks.
9006   // ---------------------------------------------------------------------------
9007 
9008   auto Plan = std::make_unique<VPlan>();
9009 
9010   // Scan the body of the loop in a topological order to visit each basic block
9011   // after having visited its predecessor basic blocks.
9012   LoopBlocksDFS DFS(OrigLoop);
9013   DFS.perform(LI);
9014 
9015   VPBasicBlock *VPBB = nullptr;
9016   VPBasicBlock *HeaderVPBB = nullptr;
9017   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
9018   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9019     // Relevant instructions from basic block BB will be grouped into VPRecipe
9020     // ingredients and fill a new VPBasicBlock.
9021     unsigned VPBBsForBB = 0;
9022     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
9023     if (VPBB)
9024       VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
9025     else {
9026       auto *TopRegion = new VPRegionBlock("vector loop");
9027       TopRegion->setEntry(FirstVPBBForBB);
9028       Plan->setEntry(TopRegion);
9029       HeaderVPBB = FirstVPBBForBB;
9030     }
9031     VPBB = FirstVPBBForBB;
9032     Builder.setInsertPoint(VPBB);
9033 
9034     // Introduce each ingredient into VPlan.
9035     // TODO: Model and preserve debug intrinsics in VPlan.
9036     for (Instruction &I : BB->instructionsWithoutDebug()) {
9037       Instruction *Instr = &I;
9038 
9039       // First filter out irrelevant instructions, to ensure no recipes are
9040       // built for them.
9041       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9042         continue;
9043 
9044       SmallVector<VPValue *, 4> Operands;
9045       auto *Phi = dyn_cast<PHINode>(Instr);
9046       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9047         Operands.push_back(Plan->getOrAddVPValue(
9048             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9049       } else {
9050         auto OpRange = Plan->mapToVPValues(Instr->operands());
9051         Operands = {OpRange.begin(), OpRange.end()};
9052       }
9053       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9054               Instr, Operands, Range, Plan)) {
9055         // If Instr can be simplified to an existing VPValue, use it.
9056         if (RecipeOrValue.is<VPValue *>()) {
9057           auto *VPV = RecipeOrValue.get<VPValue *>();
9058           Plan->addVPValue(Instr, VPV);
9059           // If the re-used value is a recipe, register the recipe for the
9060           // instruction, in case the recipe for Instr needs to be recorded.
9061           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
9062             RecipeBuilder.setRecipe(Instr, R);
9063           continue;
9064         }
9065         // Otherwise, add the new recipe.
9066         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
9067         for (auto *Def : Recipe->definedValues()) {
9068           auto *UV = Def->getUnderlyingValue();
9069           Plan->addVPValue(UV, Def);
9070         }
9071 
9072         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
9073             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
9074           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
9075           // of the header block. That can happen for truncates of induction
9076           // variables. Those recipes are moved to the phi section of the header
9077           // block after applying SinkAfter, which relies on the original
9078           // position of the trunc.
9079           assert(isa<TruncInst>(Instr));
9080           InductionsToMove.push_back(
9081               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
9082         }
9083         RecipeBuilder.setRecipe(Instr, Recipe);
9084         VPBB->appendRecipe(Recipe);
9085         continue;
9086       }
9087 
9088       // Otherwise, if all widening options failed, the instruction is to be
9089       // replicated. This may create a successor for VPBB.
9090       VPBasicBlock *NextVPBB =
9091           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
9092       if (NextVPBB != VPBB) {
9093         VPBB = NextVPBB;
9094         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
9095                                     : "");
9096       }
9097     }
9098   }
9099 
9100   assert(isa<VPRegionBlock>(Plan->getEntry()) &&
9101          !Plan->getEntry()->getEntryBasicBlock()->empty() &&
9102          "entry block must be set to a VPRegionBlock having a non-empty entry "
9103          "VPBasicBlock");
9104   RecipeBuilder.fixHeaderPhis();
9105 
9106   // ---------------------------------------------------------------------------
9107   // Transform initial VPlan: Apply previously taken decisions, in order, to
9108   // bring the VPlan to its final state.
9109   // ---------------------------------------------------------------------------
9110 
9111   // Apply Sink-After legal constraints.
9112   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9113     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9114     if (Region && Region->isReplicator()) {
9115       assert(Region->getNumSuccessors() == 1 &&
9116              Region->getNumPredecessors() == 1 && "Expected SESE region!");
9117       assert(R->getParent()->size() == 1 &&
9118              "A recipe in an original replicator region must be the only "
9119              "recipe in its block");
9120       return Region;
9121     }
9122     return nullptr;
9123   };
9124   for (auto &Entry : SinkAfter) {
9125     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9126     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9127 
9128     auto *TargetRegion = GetReplicateRegion(Target);
9129     auto *SinkRegion = GetReplicateRegion(Sink);
9130     if (!SinkRegion) {
9131       // If the sink source is not a replicate region, sink the recipe directly.
9132       if (TargetRegion) {
9133         // The target is in a replication region, make sure to move Sink to
9134         // the block after it, not into the replication region itself.
9135         VPBasicBlock *NextBlock =
9136             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9137         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9138       } else
9139         Sink->moveAfter(Target);
9140       continue;
9141     }
9142 
9143     // The sink source is in a replicate region. Unhook the region from the CFG.
9144     auto *SinkPred = SinkRegion->getSinglePredecessor();
9145     auto *SinkSucc = SinkRegion->getSingleSuccessor();
9146     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9147     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9148     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9149 
9150     if (TargetRegion) {
9151       // The target recipe is also in a replicate region, move the sink region
9152       // after the target region.
9153       auto *TargetSucc = TargetRegion->getSingleSuccessor();
9154       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9155       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9156       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9157     } else {
9158       // The sink source is in a replicate region; we need to move the whole
9159       // replicate region, which should only contain a single recipe in the
9160       // main block.
9161       auto *SplitBlock =
9162           Target->getParent()->splitAt(std::next(Target->getIterator()));
9163 
9164       auto *SplitPred = SplitBlock->getSinglePredecessor();
9165 
9166       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9167       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9168       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9169       if (VPBB == SplitPred)
9170         VPBB = SplitBlock;
9171     }
9172   }
9173 
9174   cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB);
9175 
9176   VPlanTransforms::removeRedundantInductionCasts(*Plan);
9177 
9178   // Now that sink-after is done, move induction recipes for optimized truncates
9179   // to the phi section of the header block.
9180   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
9181     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9182 
9183   // Adjust the recipes for any inloop reductions.
9184   adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start);
9185 
9186   // Introduce a recipe to combine the incoming and previous values of a
9187   // first-order recurrence.
9188   for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9189     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9190     if (!RecurPhi)
9191       continue;
9192 
9193     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
9194     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9195     auto *Region = GetReplicateRegion(PrevRecipe);
9196     if (Region)
9197       InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
9198     if (Region || PrevRecipe->isPhi())
9199       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9200     else
9201       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9202 
9203     auto *RecurSplice = cast<VPInstruction>(
9204         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9205                              {RecurPhi, RecurPhi->getBackedgeValue()}));
9206 
9207     RecurPhi->replaceAllUsesWith(RecurSplice);
9208     // Set the first operand of RecurSplice to RecurPhi again, after replacing
9209     // all users.
9210     RecurSplice->setOperand(0, RecurPhi);
9211   }
9212 
9213   // Interleave memory: for each Interleave Group we marked earlier as relevant
9214   // for this VPlan, replace the Recipes widening its memory instructions with a
9215   // single VPInterleaveRecipe at its insertion point.
9216   for (auto IG : InterleaveGroups) {
9217     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9218         RecipeBuilder.getRecipe(IG->getInsertPos()));
9219     SmallVector<VPValue *, 4> StoredValues;
9220     for (unsigned i = 0; i < IG->getFactor(); ++i)
9221       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9222         auto *StoreR =
9223             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9224         StoredValues.push_back(StoreR->getStoredValue());
9225       }
9226 
9227     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9228                                         Recipe->getMask());
9229     VPIG->insertBefore(Recipe);
9230     unsigned J = 0;
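         // Redirect users of each member's old VPValue to the corresponding value
         // defined by the interleave recipe, then erase the members' old recipes.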
9231     for (unsigned i = 0; i < IG->getFactor(); ++i)
9232       if (Instruction *Member = IG->getMember(i)) {
9233         if (!Member->getType()->isVoidTy()) {
9234           VPValue *OriginalV = Plan->getVPValue(Member);
9235           Plan->removeVPValueFor(Member);
9236           Plan->addVPValue(Member, VPIG->getVPValue(J));
9237           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9238           J++;
9239         }
9240         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9241       }
9242   }
9243 
9244   // From this point onwards, VPlan-to-VPlan transformations may change the plan
9245   // in ways that make accessing values using original IR values incorrect.
9246   Plan->disableValue2VPValue();
9247 
9248   VPlanTransforms::sinkScalarOperands(*Plan);
9249   VPlanTransforms::mergeReplicateRegions(*Plan);
9250 
9251   std::string PlanName;
9252   raw_string_ostream RSO(PlanName);
9253   ElementCount VF = Range.Start;
9254   Plan->addVF(VF);
9255   RSO << "Initial VPlan for VF={" << VF;
9256   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9257     Plan->addVF(VF);
9258     RSO << "," << VF;
9259   }
9260   RSO << "},UF>=1";
9261   RSO.flush();
9262   Plan->setName(PlanName);
9263 
9264   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9265   return Plan;
9266 }
9267 
9268 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9269   // Outer loop handling: They may require CFG and instruction level
9270   // transformations before even evaluating whether vectorization is profitable.
9271   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9272   // the vectorization pipeline.
9273   assert(!OrigLoop->isInnermost());
9274   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9275 
9276   // Create new empty VPlan
9277   auto Plan = std::make_unique<VPlan>();
9278 
9279   // Build hierarchical CFG
9280   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9281   HCFGBuilder.buildHierarchicalCFG();
9282 
9283   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9284        VF *= 2)
9285     Plan->addVF(VF);
9286 
9287   if (EnableVPlanPredication) {
9288     VPlanPredicator VPP(*Plan);
9289     VPP.predicate();
9290 
9291     // Avoid running transformation to recipes until masked code generation in
9292     // VPlan-native path is in place.
9293     return Plan;
9294   }
9295 
9296   SmallPtrSet<Instruction *, 1> DeadInstructions;
9297   VPlanTransforms::VPInstructionsToVPRecipes(
9298       OrigLoop, Plan,
9299       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9300       DeadInstructions, *PSE.getSE());
9301   return Plan;
9302 }
9303 
9304 // Adjust the recipes for reductions. For in-loop reductions, the chain of
9305 // instructions leading from the loop exit instr to the phi needs to be converted
9306 // to reductions, with one operand being a vector and the other being the scalar
9307 // reduction chain. For other reductions, a select is introduced between the phi
9308 // and live-out recipes when folding the tail.
9309 void LoopVectorizationPlanner::adjustRecipesForReductions(
9310     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9311     ElementCount MinVF) {
9312   for (auto &Reduction : CM.getInLoopReductionChains()) {
9313     PHINode *Phi = Reduction.first;
9314     const RecurrenceDescriptor &RdxDesc =
9315         Legal->getReductionVars().find(Phi)->second;
9316     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9317 
9318     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9319       continue;
9320 
9321     // ReductionOperations are ordered top-down from the phi's use to the
9322     // LoopExitValue. We keep track of the previous item (the Chain) to tell
9323     // which of the two operands will remain scalar and which will be reduced.
9324     // For minmax, the chain will be the select instructions.
9325     Instruction *Chain = Phi;
9326     for (Instruction *R : ReductionOperations) {
9327       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9328       RecurKind Kind = RdxDesc.getRecurrenceKind();
9329 
9330       VPValue *ChainOp = Plan->getVPValue(Chain);
9331       unsigned FirstOpId;
9332       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9333              "Only min/max recurrences allowed for inloop reductions");
9334       // Recognize a call to the llvm.fmuladd intrinsic.
9335       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9336       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9337              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9338       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9339         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9340                "Expected to replace a VPWidenSelectSC");
9341         FirstOpId = 1;
9342       } else {
9343         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9344                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9345                "Expected to replace a VPWidenSC");
9346         FirstOpId = 0;
9347       }
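           // The operand of R that is not the scalar reduction chain becomes the
           // vector operand of the reduction.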
9348       unsigned VecOpId =
9349           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9350       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9351 
9352       auto *CondOp = CM.foldTailByMasking()
9353                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9354                          : nullptr;
9355 
9356       if (IsFMulAdd) {
9357         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9358         // need to create an fmul recipe to use as the vector operand for the
9359         // fadd reduction.
9360         VPInstruction *FMulRecipe = new VPInstruction(
9361             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9362         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9363         WidenRecipe->getParent()->insert(FMulRecipe,
9364                                          WidenRecipe->getIterator());
9365         VecOp = FMulRecipe;
9366       }
9367       VPReductionRecipe *RedRecipe =
9368           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9369       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9370       Plan->removeVPValueFor(R);
9371       Plan->addVPValue(R, RedRecipe);
9372       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9373       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9374       WidenRecipe->eraseFromParent();
9375 
9376       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9377         VPRecipeBase *CompareRecipe =
9378             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9379         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9380                "Expected to replace a VPWidenSC");
9381         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9382                "Expected no remaining users");
9383         CompareRecipe->eraseFromParent();
9384       }
9385       Chain = R;
9386     }
9387   }
9388 
9389   // If tail is folded by masking, introduce selects between the phi
9390   // and the live-out instruction of each reduction, at the end of the latch.
9391   if (CM.foldTailByMasking()) {
9392     for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9393       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9394       if (!PhiR || PhiR->isInLoop())
9395         continue;
9396       Builder.setInsertPoint(LatchVPBB);
9397       VPValue *Cond =
9398           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9399       VPValue *Red = PhiR->getBackedgeValue();
9400       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9401     }
9402   }
9403 }
9404 
9405 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9406 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9407                                VPSlotTracker &SlotTracker) const {
9408   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9409   IG->getInsertPos()->printAsOperand(O, false);
9410   O << ", ";
9411   getAddr()->printAsOperand(O, SlotTracker);
9412   VPValue *Mask = getMask();
9413   if (Mask) {
9414     O << ", ";
9415     Mask->printAsOperand(O, SlotTracker);
9416   }
9417 
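       // Print one line per interleave-group member: the stored operand for store
       // groups, or the value defined for load groups.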
9418   unsigned OpIdx = 0;
9419   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9420     if (!IG->getMember(i))
9421       continue;
9422     if (getNumStoreOperands() > 0) {
9423       O << "\n" << Indent << "  store ";
9424       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9425       O << " to index " << i;
9426     } else {
9427       O << "\n" << Indent << "  ";
9428       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9429       O << " = load from index " << i;
9430     }
9431     ++OpIdx;
9432   }
9433 }
9434 #endif
9435 
9436 void VPWidenCallRecipe::execute(VPTransformState &State) {
9437   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9438                                   *this, State);
9439 }
9440 
9441 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9442   auto &I = *cast<SelectInst>(getUnderlyingInstr());
9443   State.ILV->setDebugLocFromInst(&I);
9444 
9445   // The condition can be loop invariant but still defined inside the
9446   // loop. This means that we can't just use the original 'cond' value.
9447   // We have to take the 'vectorized' value and pick the first lane.
9448   // Instcombine will make this a no-op.
9449   auto *InvarCond =
9450       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9451 
9452   for (unsigned Part = 0; Part < State.UF; ++Part) {
9453     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9454     Value *Op0 = State.get(getOperand(1), Part);
9455     Value *Op1 = State.get(getOperand(2), Part);
9456     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9457     State.set(this, Sel, Part);
9458     State.ILV->addMetadata(Sel, &I);
9459   }
9460 }
9461 
9462 void VPWidenRecipe::execute(VPTransformState &State) {
9463   auto &I = *cast<Instruction>(getUnderlyingValue());
9464   auto &Builder = State.Builder;
9465   switch (I.getOpcode()) {
9466   case Instruction::Call:
9467   case Instruction::Br:
9468   case Instruction::PHI:
9469   case Instruction::GetElementPtr:
9470   case Instruction::Select:
9471     llvm_unreachable("This instruction is handled by a different recipe.");
9472   case Instruction::UDiv:
9473   case Instruction::SDiv:
9474   case Instruction::SRem:
9475   case Instruction::URem:
9476   case Instruction::Add:
9477   case Instruction::FAdd:
9478   case Instruction::Sub:
9479   case Instruction::FSub:
9480   case Instruction::FNeg:
9481   case Instruction::Mul:
9482   case Instruction::FMul:
9483   case Instruction::FDiv:
9484   case Instruction::FRem:
9485   case Instruction::Shl:
9486   case Instruction::LShr:
9487   case Instruction::AShr:
9488   case Instruction::And:
9489   case Instruction::Or:
9490   case Instruction::Xor: {
9491     // Just widen unops and binops.
9492     State.ILV->setDebugLocFromInst(&I);
9493 
9494     for (unsigned Part = 0; Part < State.UF; ++Part) {
9495       SmallVector<Value *, 2> Ops;
9496       for (VPValue *VPOp : operands())
9497         Ops.push_back(State.get(VPOp, Part));
9498 
9499       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
9500 
9501       if (auto *VecOp = dyn_cast<Instruction>(V)) {
9502         VecOp->copyIRFlags(&I);
9503 
9504         // If the instruction is vectorized and was in a basic block that needed
9505         // predication, we can't propagate poison-generating flags (nuw/nsw,
9506         // exact, etc.). The control flow has been linearized and the
9507         // instruction is no longer guarded by the predicate, which could cause
9508         // the flag properties to no longer hold.
9509         if (State.MayGeneratePoisonRecipes.count(this) > 0)
9510           VecOp->dropPoisonGeneratingFlags();
9511       }
9512 
9513       // Use this vector value for all users of the original instruction.
9514       State.set(this, V, Part);
9515       State.ILV->addMetadata(V, &I);
9516     }
9517 
9518     break;
9519   }
9520   case Instruction::ICmp:
9521   case Instruction::FCmp: {
9522     // Widen compares. Generate vector compares.
9523     bool FCmp = (I.getOpcode() == Instruction::FCmp);
9524     auto *Cmp = cast<CmpInst>(&I);
9525     State.ILV->setDebugLocFromInst(Cmp);
9526     for (unsigned Part = 0; Part < State.UF; ++Part) {
9527       Value *A = State.get(getOperand(0), Part);
9528       Value *B = State.get(getOperand(1), Part);
9529       Value *C = nullptr;
9530       if (FCmp) {
9531         // Propagate fast math flags.
9532         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9533         Builder.setFastMathFlags(Cmp->getFastMathFlags());
9534         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
9535       } else {
9536         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
9537       }
9538       State.set(this, C, Part);
9539       State.ILV->addMetadata(C, &I);
9540     }
9541 
9542     break;
9543   }
9544 
9545   case Instruction::ZExt:
9546   case Instruction::SExt:
9547   case Instruction::FPToUI:
9548   case Instruction::FPToSI:
9549   case Instruction::FPExt:
9550   case Instruction::PtrToInt:
9551   case Instruction::IntToPtr:
9552   case Instruction::SIToFP:
9553   case Instruction::UIToFP:
9554   case Instruction::Trunc:
9555   case Instruction::FPTrunc:
9556   case Instruction::BitCast: {
9557     auto *CI = cast<CastInst>(&I);
9558     State.ILV->setDebugLocFromInst(CI);
9559 
9560     // Vectorize casts.
9561     Type *DestTy = (State.VF.isScalar())
9562                        ? CI->getType()
9563                        : VectorType::get(CI->getType(), State.VF);
9564 
9565     for (unsigned Part = 0; Part < State.UF; ++Part) {
9566       Value *A = State.get(getOperand(0), Part);
9567       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
9568       State.set(this, Cast, Part);
9569       State.ILV->addMetadata(Cast, &I);
9570     }
9571     break;
9572   }
9573   default:
9574     // This instruction is not vectorized by simple widening.
9575     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
9576     llvm_unreachable("Unhandled instruction!");
9577   } // end of switch.
9578 }
9579 
9580 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9581   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
9582   // Construct a vector GEP by widening the operands of the scalar GEP as
9583   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
9584   // results in a vector of pointers when at least one operand of the GEP
9585   // is vector-typed. Thus, to keep the representation compact, we only use
9586   // vector-typed operands for loop-varying values.
9587 
9588   if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
9589     // If we are vectorizing, but the GEP has only loop-invariant operands,
9590     // the GEP we build (by only using vector-typed operands for
9591     // loop-varying values) would be a scalar pointer. Thus, to ensure we
9592     // produce a vector of pointers, we need to either arbitrarily pick an
9593     // operand to broadcast, or broadcast a clone of the original GEP.
9594     // Here, we broadcast a clone of the original.
9595     //
9596     // TODO: If at some point we decide to scalarize instructions having
9597     //       loop-invariant operands, this special case will no longer be
9598     //       required. We would add the scalarization decision to
9599     //       collectLoopScalars() and teach getVectorValue() to broadcast
9600     //       the lane-zero scalar value.
9601     auto *Clone = State.Builder.Insert(GEP->clone());
9602     for (unsigned Part = 0; Part < State.UF; ++Part) {
9603       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
9604       State.set(this, EntryPart, Part);
9605       State.ILV->addMetadata(EntryPart, GEP);
9606     }
9607   } else {
9608     // If the GEP has at least one loop-varying operand, we are sure to
9609     // produce a vector of pointers. But if we are only unrolling, we want
9610     // to produce a scalar GEP for each unroll part. Thus, the GEP we
9611     // produce with the code below will be scalar (if VF == 1) or vector
9612     // (otherwise). Note that for the unroll-only case, we still maintain
9613     // values in the vector mapping with initVector, as we do for other
9614     // instructions.
9615     for (unsigned Part = 0; Part < State.UF; ++Part) {
9616       // The pointer operand of the new GEP. If it's loop-invariant, we
9617       // won't broadcast it.
9618       auto *Ptr = IsPtrLoopInvariant
9619                       ? State.get(getOperand(0), VPIteration(0, 0))
9620                       : State.get(getOperand(0), Part);
9621 
9622       // Collect all the indices for the new GEP. If any index is
9623       // loop-invariant, we won't broadcast it.
9624       SmallVector<Value *, 4> Indices;
9625       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9626         VPValue *Operand = getOperand(I);
9627         if (IsIndexLoopInvariant[I - 1])
9628           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9629         else
9630           Indices.push_back(State.get(Operand, Part));
9631       }
9632 
9633       // If the GEP instruction is vectorized and was in a basic block that
9634       // needed predication, we can't propagate the poison-generating 'inbounds'
9635       // flag. The control flow has been linearized and the GEP is no longer
9636       // guarded by the predicate, which could cause the 'inbounds' property to
9637       // no longer hold.
9638       bool IsInBounds =
9639           GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9640 
9641       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
9642       // but it should be a vector, otherwise.
9643       auto *NewGEP = IsInBounds
9644                          ? State.Builder.CreateInBoundsGEP(
9645                                GEP->getSourceElementType(), Ptr, Indices)
9646                          : State.Builder.CreateGEP(GEP->getSourceElementType(),
9647                                                    Ptr, Indices);
9648       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9649              "NewGEP is not a pointer vector");
9650       State.set(this, NewGEP, Part);
9651       State.ILV->addMetadata(NewGEP, GEP);
9652     }
9653   }
9654 }
9655 
9656 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9657   assert(!State.Instance && "Int or FP induction being replicated.");
9658   State.ILV->widenIntOrFpInduction(IV, getInductionDescriptor(),
9659                                    getStartValue()->getLiveInIRValue(),
9660                                    getTruncInst(), getVPValue(0), State);
9661 }
9662 
9663 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9664   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
9665                                  State);
9666 }
9667 
9668 void VPBlendRecipe::execute(VPTransformState &State) {
9669   State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9670   // We know that all PHIs in non-header blocks are converted into
9671   // selects, so we don't have to worry about the insertion order and we
9672   // can just use the builder.
9673   // At this point we generate the predication tree. There may be
9674   // duplications since this is a simple recursive scan, but future
9675   // optimizations will clean it up.
9676 
9677   unsigned NumIncoming = getNumIncomingValues();
9678 
9679   // Generate a sequence of selects of the form:
9680   // SELECT(Mask3, In3,
9681   //        SELECT(Mask2, In2,
9682   //               SELECT(Mask1, In1,
9683   //                      In0)))
9684   // Note that Mask0 is never used: lanes for which no path reaches this phi,
9685   // and which are therefore essentially undef, are taken from In0.
9686   InnerLoopVectorizer::VectorParts Entry(State.UF);
9687   for (unsigned In = 0; In < NumIncoming; ++In) {
9688     for (unsigned Part = 0; Part < State.UF; ++Part) {
9689       // We might have single-edge PHIs (blocks); there is nothing to select
9690       // against for the first incoming value, so use it as-is.
9691       Value *In0 = State.get(getIncomingValue(In), Part);
9692       if (In == 0)
9693         Entry[Part] = In0; // Initialize with the first incoming value.
9694       else {
9695         // Select between the current value and the previous incoming edge
9696         // based on the incoming mask.
9697         Value *Cond = State.get(getMask(In), Part);
9698         Entry[Part] =
9699             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9700       }
9701     }
9702   }
9703   for (unsigned Part = 0; Part < State.UF; ++Part)
9704     State.set(this, Entry[Part], Part);
9705 }
9706 
9707 void VPInterleaveRecipe::execute(VPTransformState &State) {
9708   assert(!State.Instance && "Interleave group being replicated.");
9709   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9710                                       getStoredValues(), getMask());
9711 }
9712 
9713 void VPReductionRecipe::execute(VPTransformState &State) {
9714   assert(!State.Instance && "Reduction being replicated.");
9715   Value *PrevInChain = State.get(getChainOp(), 0);
9716   RecurKind Kind = RdxDesc->getRecurrenceKind();
9717   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9718   // Propagate the fast-math flags carried by the underlying instruction.
9719   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9720   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9721   for (unsigned Part = 0; Part < State.UF; ++Part) {
9722     Value *NewVecOp = State.get(getVecOp(), Part);
9723     if (VPValue *Cond = getCondOp()) {
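      // A conditional (predicated) reduction: replace the masked-off lanes
      // with the reduction's identity value (e.g. zero for an integer add) so
      // they do not affect the reduced result.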
9724       Value *NewCond = State.get(Cond, Part);
9725       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9726       Value *Iden = RdxDesc->getRecurrenceIdentity(
9727           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9728       Value *IdenVec =
9729           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9730       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9731       NewVecOp = Select;
9732     }
9733     Value *NewRed;
9734     Value *NextInChain;
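    // Ordered (strict in-order FP) reductions must fold each part into the
    // running chain value sequentially; unordered reductions reduce each
    // part's vector independently and combine the result with the chain
    // value below.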
9735     if (IsOrdered) {
9736       if (State.VF.isVector())
9737         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9738                                         PrevInChain);
9739       else
9740         NewRed = State.Builder.CreateBinOp(
9741             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9742             NewVecOp);
9743       PrevInChain = NewRed;
9744     } else {
9745       PrevInChain = State.get(getChainOp(), Part);
9746       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9747     }
9748     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9749       NextInChain =
9750           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9751                          NewRed, PrevInChain);
9752     } else if (IsOrdered)
9753       NextInChain = NewRed;
9754     else
9755       NextInChain = State.Builder.CreateBinOp(
9756           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9757           PrevInChain);
9758     State.set(this, NextInChain, Part);
9759   }
9760 }
9761 
9762 void VPReplicateRecipe::execute(VPTransformState &State) {
9763   if (State.Instance) { // Generate a single instance.
9764     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9765     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9766                                     IsPredicated, State);
9767     // Insert scalar instance packing it into a vector.
9768     if (AlsoPack && State.VF.isVector()) {
9769       // If we're constructing lane 0, initialize to start from poison.
9770       if (State.Instance->Lane.isFirstLane()) {
9771         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9772         Value *Poison = PoisonValue::get(
9773             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9774         State.set(this, Poison, State.Instance->Part);
9775       }
9776       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9777     }
9778     return;
9779   }
9780 
9781   // Generate scalar instances for all VF lanes of all UF parts, unless the
9782   // instruction is uniform, in which case generate only the first lane for each
9783   // of the UF parts.
9784   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
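  // A scalable VF has no lane count known at compile time, so scalarizing is
  // only possible when a single lane-0 copy per part suffices.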
9785   assert((!State.VF.isScalable() || IsUniform) &&
9786          "Can't scalarize a scalable vector");
9787   for (unsigned Part = 0; Part < State.UF; ++Part)
9788     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9789       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9790                                       VPIteration(Part, Lane), IsPredicated,
9791                                       State);
9792 }
9793 
9794 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9795   assert(State.Instance && "Branch on Mask works only on single instance.");
9796 
9797   unsigned Part = State.Instance->Part;
9798   unsigned Lane = State.Instance->Lane.getKnownLane();
9799 
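  // Compute the branch condition for this (Part, Lane): the corresponding bit
  // of the block-in mask, or true if the block is entered unconditionally.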
9800   Value *ConditionBit = nullptr;
9801   VPValue *BlockInMask = getMask();
9802   if (BlockInMask) {
9803     ConditionBit = State.get(BlockInMask, Part);
9804     if (ConditionBit->getType()->isVectorTy())
9805       ConditionBit = State.Builder.CreateExtractElement(
9806           ConditionBit, State.Builder.getInt32(Lane));
9807   } else // Block in mask is all-one.
9808     ConditionBit = State.Builder.getTrue();
9809 
9810   // Replace the temporary unreachable terminator with a new conditional branch,
9811   // whose two destinations will be set later when they are created.
9812   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9813   assert(isa<UnreachableInst>(CurrentTerminator) &&
9814          "Expected to replace unreachable terminator with conditional branch.");
9815   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9816   CondBr->setSuccessor(0, nullptr);
9817   ReplaceInstWithInst(CurrentTerminator, CondBr);
9818 }
9819 
9820 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9821   assert(State.Instance && "Predicated instruction PHI works per instance.");
9822   Instruction *ScalarPredInst =
9823       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9824   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9825   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9826   assert(PredicatingBB && "Predicated block has no single predecessor.");
9827   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9828          "operand must be VPReplicateRecipe");
9829 
9830   // By current pack/unpack logic we need to generate only a single phi node: if
9831   // a vector value for the predicated instruction exists at this point it means
9832   // the instruction has vector users only, and a phi for the vector value is
9833   // needed. In this case the recipe of the predicated instruction is marked to
9834   // also do that packing, thereby "hoisting" the insert-element sequence.
9835   // Otherwise, a phi node for the scalar value is needed.
9836   unsigned Part = State.Instance->Part;
9837   if (State.hasVectorValue(getOperand(0), Part)) {
9838     Value *VectorValue = State.get(getOperand(0), Part);
9839     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9840     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9841     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9842     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9843     if (State.hasVectorValue(this, Part))
9844       State.reset(this, VPhi, Part);
9845     else
9846       State.set(this, VPhi, Part);
9847     // NOTE: Currently we need to update the value of the operand, so the next
9848     // predicated iteration inserts its generated value in the correct vector.
9849     State.reset(getOperand(0), VPhi, Part);
9850   } else {
9851     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9852     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9853     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9854                      PredicatingBB);
9855     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9856     if (State.hasScalarValue(this, *State.Instance))
9857       State.reset(this, Phi, *State.Instance);
9858     else
9859       State.set(this, Phi, *State.Instance);
9860     // NOTE: Currently we need to update the value of the operand, so the next
9861     // predicated iteration inserts its generated value in the correct vector.
9862     State.reset(getOperand(0), Phi, *State.Instance);
9863   }
9864 }
9865 
9866 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9867   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9868 
9869   // Determine whether this recipe widens a load or a store.
9870   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9871   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9872 
9873   assert((LI || SI) && "Invalid Load/Store instruction");
9874   assert((!SI || StoredValue) && "No stored value provided for widened store");
9875   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9876 
9877   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9878 
9879   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9880   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9881   bool CreateGatherScatter = !Consecutive;
9882 
9883   auto &Builder = State.Builder;
9884   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9885   bool isMaskRequired = getMask();
9886   if (isMaskRequired)
9887     for (unsigned Part = 0; Part < State.UF; ++Part)
9888       BlockInMaskParts[Part] = State.get(getMask(), Part);
9889 
9890   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9891     // Calculate the pointer for the specific unroll-part.
9892     GetElementPtrInst *PartPtr = nullptr;
9893 
9894     bool InBounds = false;
9895     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9896       InBounds = gep->isInBounds();
9897     if (Reverse) {
9898       // If the address is consecutive but reversed, then the
9899       // wide store needs to start at the last vector element.
9900       // RunTimeVF = VScale * VF.getKnownMinValue()
9901       // For fixed-width vectors, VScale is 1, so RunTimeVF = VF.getKnownMinValue()
9902       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9903       // NumElt = -Part * RunTimeVF
9904       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9905       // LastLane = 1 - RunTimeVF
9906       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
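      // For example (illustrative, fixed-width VF = 4, so RunTimeVF = 4),
      // Part 1 gives NumElt = -4 and LastLane = -3, i.e. a pointer to Ptr - 7,
      // so the wide access covers elements [Ptr - 7, Ptr - 4]; the accessed
      // value itself is reversed separately below.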
9907       PartPtr =
9908           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9909       PartPtr->setIsInBounds(InBounds);
9910       PartPtr = cast<GetElementPtrInst>(
9911           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9912       PartPtr->setIsInBounds(InBounds);
9913       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9914         BlockInMaskParts[Part] =
9915             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9916     } else {
9917       Value *Increment =
9918           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9919       PartPtr = cast<GetElementPtrInst>(
9920           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9921       PartPtr->setIsInBounds(InBounds);
9922     }
9923 
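    // The per-part pointer computed above addresses a single scalar element;
    // bitcast it to a pointer to the wide vector type expected by the vector
    // load or store built by the caller.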
9924     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9925     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9926   };
9927 
9928   // Handle Stores:
9929   if (SI) {
9930     State.ILV->setDebugLocFromInst(SI);
9931 
9932     for (unsigned Part = 0; Part < State.UF; ++Part) {
9933       Instruction *NewSI = nullptr;
9934       Value *StoredVal = State.get(StoredValue, Part);
9935       if (CreateGatherScatter) {
9936         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9937         Value *VectorGep = State.get(getAddr(), Part);
9938         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9939                                             MaskPart);
9940       } else {
9941         if (Reverse) {
9942           // If we store to reverse consecutive memory locations, then we need
9943           // to reverse the order of elements in the stored value.
9944           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9945           // We don't want to update the value in the map as it might be used in
9946           // another expression. So don't call resetVectorValue(StoredVal).
9947         }
9948         auto *VecPtr =
9949             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9950         if (isMaskRequired)
9951           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9952                                             BlockInMaskParts[Part]);
9953         else
9954           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9955       }
9956       State.ILV->addMetadata(NewSI, SI);
9957     }
9958     return;
9959   }
9960 
9961   // Handle loads.
9962   assert(LI && "Must have a load instruction");
9963   State.ILV->setDebugLocFromInst(LI);
9964   for (unsigned Part = 0; Part < State.UF; ++Part) {
9965     Value *NewLI;
9966     if (CreateGatherScatter) {
9967       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9968       Value *VectorGep = State.get(getAddr(), Part);
9969       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9970                                          nullptr, "wide.masked.gather");
9971       State.ILV->addMetadata(NewLI, LI);
9972     } else {
9973       auto *VecPtr =
9974           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9975       if (isMaskRequired)
9976         NewLI = Builder.CreateMaskedLoad(
9977             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9978             PoisonValue::get(DataTy), "wide.masked.load");
9979       else
9980         NewLI =
9981             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9982 
9983       // Add metadata to the load, but record the reverse shuffle as the result.
9984       State.ILV->addMetadata(NewLI, LI);
9985       if (Reverse)
9986         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9987     }
9988 
9989     State.set(getVPSingleValue(), NewLI, Part);
9990   }
9991 }
9992 
9993 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9994 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9995 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9996 // for predication.
9997 static ScalarEpilogueLowering getScalarEpilogueLowering(
9998     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9999     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10000     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
10001     LoopVectorizationLegality &LVL) {
10002   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10003   // don't look at hints or options, and don't request a scalar epilogue.
10004   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10005   // LoopAccessInfo (due to code dependency and not being able to reliably get
10006   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10007   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10008   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10009   // back to the old way and vectorize with versioning when forced. See D81345.)
10010   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10011                                                       PGSOQueryType::IRPass) &&
10012                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10013     return CM_ScalarEpilogueNotAllowedOptSize;
10014 
10015   // 2) If set, obey the directives
10016   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10017     switch (PreferPredicateOverEpilogue) {
10018     case PreferPredicateTy::ScalarEpilogue:
10019       return CM_ScalarEpilogueAllowed;
10020     case PreferPredicateTy::PredicateElseScalarEpilogue:
10021       return CM_ScalarEpilogueNotNeededUsePredicate;
10022     case PreferPredicateTy::PredicateOrDontVectorize:
10023       return CM_ScalarEpilogueNotAllowedUsePredicate;
10024     }
10025   }
10026 
10027   // 3) If set, obey the hints
10028   switch (Hints.getPredicate()) {
10029   case LoopVectorizeHints::FK_Enabled:
10030     return CM_ScalarEpilogueNotNeededUsePredicate;
10031   case LoopVectorizeHints::FK_Disabled:
10032     return CM_ScalarEpilogueAllowed;
10033   }
10034 
10035   // 4) If the TTI hook indicates this is profitable, request predication.
10036   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10037                                        LVL.getLAI()))
10038     return CM_ScalarEpilogueNotNeededUsePredicate;
10039 
10040   return CM_ScalarEpilogueAllowed;
10041 }
10042 
10043 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
10044   // If Values have been set for this Def, return the one relevant for \p Part.
10045   if (hasVectorValue(Def, Part))
10046     return Data.PerPartOutput[Def][Part];
10047 
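  // If neither a vector nor a scalar value has been produced for this part,
  // Def is assumed to be a live-in value defined outside the vectorized
  // region; broadcast its IR value to all lanes.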
10048   if (!hasScalarValue(Def, {Part, 0})) {
10049     Value *IRV = Def->getLiveInIRValue();
10050     Value *B = ILV->getBroadcastInstrs(IRV);
10051     set(Def, B, Part);
10052     return B;
10053   }
10054 
10055   Value *ScalarValue = get(Def, {Part, 0});
10056   // If we aren't vectorizing, we can just copy the scalar map values over
10057   // to the vector map.
10058   if (VF.isScalar()) {
10059     set(Def, ScalarValue, Part);
10060     return ScalarValue;
10061   }
10062 
10063   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
10064   bool IsUniform = RepR && RepR->isUniform();
10065 
10066   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
10067   // Check if there is a scalar value for the selected lane.
10068   if (!hasScalarValue(Def, {Part, LastLane})) {
10069     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
10070     assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
10071            "unexpected recipe found to be invariant");
10072     IsUniform = true;
10073     LastLane = 0;
10074   }
10075 
10076   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10077   // Set the insert point after the last scalarized instruction or after the
10078   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10079   // will directly follow the scalar definitions.
10080   auto OldIP = Builder.saveIP();
10081   auto NewIP =
10082       isa<PHINode>(LastInst)
10083           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10084           : std::next(BasicBlock::iterator(LastInst));
10085   Builder.SetInsertPoint(&*NewIP);
10086 
10087   // However, if we are vectorizing, we need to construct the vector values.
10088   // If the value is known to be uniform after vectorization, we can just
10089   // broadcast the scalar value corresponding to lane zero for each unroll
10090   // iteration. Otherwise, we construct the vector values using
10091   // insertelement instructions. Since the resulting vectors are stored in
10092   // State, we will only generate the insertelements once.
10093   Value *VectorValue = nullptr;
10094   if (IsUniform) {
10095     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10096     set(Def, VectorValue, Part);
10097   } else {
10098     // Initialize packing with insertelements to start from poison.
10099     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
10100     Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
10101     set(Def, Poison, Part);
10102     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10103       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10104     VectorValue = get(Def, Part);
10105   }
10106   Builder.restoreIP(OldIP);
10107   return VectorValue;
10108 }
10109 
10110 // Process the loop in the VPlan-native vectorization path. This path builds
10111 // VPlan upfront in the vectorization pipeline, which makes it possible to
10112 // apply VPlan-to-VPlan transformations from the very beginning without
10113 // modifying the input LLVM IR.
10114 static bool processLoopInVPlanNativePath(
10115     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10116     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10117     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10118     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10119     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10120     LoopVectorizationRequirements &Requirements) {
10121 
10122   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10123     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10124     return false;
10125   }
10126   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10127   Function *F = L->getHeader()->getParent();
10128   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10129 
10130   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10131       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10132 
10133   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10134                                 &Hints, IAI);
10135   // Use the planner for outer loop vectorization.
10136   // TODO: CM is not used at this point inside the planner. Turn CM into an
10137   // optional argument if we don't need it in the future.
10138   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10139                                Requirements, ORE);
10140 
10141   // Get user vectorization factor.
10142   ElementCount UserVF = Hints.getWidth();
10143 
10144   CM.collectElementTypesForWidening();
10145 
10146   // Plan how to best vectorize, return the best VF and its cost.
10147   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10148 
10149   // If we are stress testing VPlan builds, do not attempt to generate vector
10150   // code. Masked vector code generation support will follow soon.
10151   // Also, do not attempt to vectorize if no vector code will be produced.
10152   if (VPlanBuildStressTest || EnableVPlanPredication ||
10153       VectorizationFactor::Disabled() == VF)
10154     return false;
10155 
10156   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10157 
10158   {
10159     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10160                              F->getParent()->getDataLayout());
10161     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10162                            &CM, BFI, PSI, Checks);
10163     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10164                       << L->getHeader()->getParent()->getName() << "\"\n");
10165     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10166   }
10167 
10168   // Mark the loop as already vectorized to avoid vectorizing again.
10169   Hints.setAlreadyVectorized();
10170   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10171   return true;
10172 }
10173 
10174 // Emit a remark if there are stores to floats that required a floating point
10175 // extension. If the vectorized loop was generated with mixed floating point
10176 // precision, there will be a performance penalty from the conversion overhead
10177 // and from the change in the vector width.
10178 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10179   SmallVector<Instruction *, 4> Worklist;
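  // Seed the worklist with every store of a 32-bit float in the loop; these
  // are the sinks from which we search backwards for conversions.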
10180   for (BasicBlock *BB : L->getBlocks()) {
10181     for (Instruction &Inst : *BB) {
10182       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10183         if (S->getValueOperand()->getType()->isFloatTy())
10184           Worklist.push_back(S);
10185       }
10186     }
10187   }
10188 
10189   // Traverse upwards from the floating point stores, searching for floating
10190   // point conversions.
10191   SmallPtrSet<const Instruction *, 4> Visited;
10192   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10193   while (!Worklist.empty()) {
10194     auto *I = Worklist.pop_back_val();
10195     if (!L->contains(I))
10196       continue;
10197     if (!Visited.insert(I).second)
10198       continue;
10199 
10200     // Emit a remark if the floating point store required a floating
10201     // point conversion.
10202     // TODO: More work could be done to identify the root cause such as a
10203     // constant or a function return type and point the user to it.
10204     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10205       ORE->emit([&]() {
10206         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10207                                           I->getDebugLoc(), L->getHeader())
10208                << "floating point conversion changes vector width. "
10209                << "Mixed floating point precision requires an up/down "
10210                << "cast that will negatively impact performance.";
10211       });
10212 
10213     for (Use &Op : I->operands())
10214       if (auto *OpI = dyn_cast<Instruction>(Op))
10215         Worklist.push_back(OpI);
10216   }
10217 }
10218 
10219 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10220     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10221                                !EnableLoopInterleaving),
10222       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10223                               !EnableLoopVectorization) {}
10224 
10225 bool LoopVectorizePass::processLoop(Loop *L) {
10226   assert((EnableVPlanNativePath || L->isInnermost()) &&
10227          "VPlan-native path is not enabled. Only process inner loops.");
10228 
10229 #ifndef NDEBUG
10230   const std::string DebugLocStr = getDebugLocString(L);
10231 #endif /* NDEBUG */
10232 
10233   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
10234                     << L->getHeader()->getParent()->getName() << "\" from "
10235                     << DebugLocStr << "\n");
10236 
10237   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
10238 
10239   LLVM_DEBUG(
10240       dbgs() << "LV: Loop hints:"
10241              << " force="
10242              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10243                      ? "disabled"
10244                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10245                             ? "enabled"
10246                             : "?"))
10247              << " width=" << Hints.getWidth()
10248              << " interleave=" << Hints.getInterleave() << "\n");
10249 
10250   // Function containing loop
10251   Function *F = L->getHeader()->getParent();
10252 
10253   // Looking at the diagnostic output is the only way to determine if a loop
10254   // was vectorized (other than looking at the IR or machine code), so it
10255   // is important to generate an optimization remark for each loop. Most of
10256   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10257   // generated as OptimizationRemark and OptimizationRemarkMissed are less
10258   // verbose and report vectorized loops and unvectorized loops that may
10259   // benefit from vectorization, respectively.
10260 
10261   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10262     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10263     return false;
10264   }
10265 
10266   PredicatedScalarEvolution PSE(*SE, *L);
10267 
10268   // Check if it is legal to vectorize the loop.
10269   LoopVectorizationRequirements Requirements;
10270   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10271                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10272   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10273     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10274     Hints.emitRemarkWithHints();
10275     return false;
10276   }
10277 
10278   // Check the function attributes and profiles to find out if this function
10279   // should be optimized for size.
10280   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10281       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10282 
10283   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10284   // here. They may require CFG and instruction level transformations before
10285   // even evaluating whether vectorization is profitable. Since we cannot modify
10286   // the incoming IR, we need to build VPlan upfront in the vectorization
10287   // pipeline.
10288   if (!L->isInnermost())
10289     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10290                                         ORE, BFI, PSI, Hints, Requirements);
10291 
10292   assert(L->isInnermost() && "Inner loop expected.");
10293 
10294   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10295   // count by optimizing for size, to minimize overheads.
10296   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10297   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10298     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10299                       << "This loop is worth vectorizing only if no scalar "
10300                       << "iteration overheads are incurred.");
10301     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10302       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10303     else {
10304       LLVM_DEBUG(dbgs() << "\n");
10305       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10306     }
10307   }
10308 
10309   // Check the function attributes to see if implicit floats are allowed.
10310   // FIXME: This check doesn't seem possibly correct -- what if the loop is
10311   // an integer loop and the vector instructions selected are purely integer
10312   // vector instructions?
10313   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10314     reportVectorizationFailure(
10315         "Can't vectorize when the NoImplicitFloat attribute is used",
10316         "loop not vectorized due to NoImplicitFloat attribute",
10317         "NoImplicitFloat", ORE, L);
10318     Hints.emitRemarkWithHints();
10319     return false;
10320   }
10321 
10322   // Check if the target supports potentially unsafe FP vectorization.
10323   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10324   // for the target we're vectorizing for, to make sure none of the
10325   // additional fp-math flags can help.
10326   if (Hints.isPotentiallyUnsafe() &&
10327       TTI->isFPVectorizationPotentiallyUnsafe()) {
10328     reportVectorizationFailure(
10329         "Potentially unsafe FP op prevents vectorization",
10330         "loop not vectorized due to unsafe FP support.",
10331         "UnsafeFP", ORE, L);
10332     Hints.emitRemarkWithHints();
10333     return false;
10334   }
10335 
10336   bool AllowOrderedReductions;
10337   // If the flag is set, use that instead and override the TTI behaviour.
10338   if (ForceOrderedReductions.getNumOccurrences() > 0)
10339     AllowOrderedReductions = ForceOrderedReductions;
10340   else
10341     AllowOrderedReductions = TTI->enableOrderedReductions();
10342   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10343     ORE->emit([&]() {
10344       auto *ExactFPMathInst = Requirements.getExactFPInst();
10345       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10346                                                  ExactFPMathInst->getDebugLoc(),
10347                                                  ExactFPMathInst->getParent())
10348              << "loop not vectorized: cannot prove it is safe to reorder "
10349                 "floating-point operations";
10350     });
10351     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10352                          "reorder floating-point operations\n");
10353     Hints.emitRemarkWithHints();
10354     return false;
10355   }
10356 
10357   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10358   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10359 
10360   // If an override option has been passed in for interleaved accesses, use it.
10361   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10362     UseInterleaved = EnableInterleavedMemAccesses;
10363 
10364   // Analyze interleaved memory accesses.
10365   if (UseInterleaved) {
10366     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10367   }
10368 
10369   // Use the cost model.
10370   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10371                                 F, &Hints, IAI);
10372   CM.collectValuesToIgnore();
10373   CM.collectElementTypesForWidening();
10374 
10375   // Use the planner for vectorization.
10376   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10377                                Requirements, ORE);
10378 
10379   // Get user vectorization factor and interleave count.
10380   ElementCount UserVF = Hints.getWidth();
10381   unsigned UserIC = Hints.getInterleave();
10382 
10383   // Plan how to best vectorize, return the best VF and its cost.
10384   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10385 
10386   VectorizationFactor VF = VectorizationFactor::Disabled();
10387   unsigned IC = 1;
10388 
10389   if (MaybeVF) {
10390     VF = *MaybeVF;
10391     // Select the interleave count.
10392     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10393   }
10394 
10395   // Identify the diagnostic messages that should be produced.
10396   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10397   bool VectorizeLoop = true, InterleaveLoop = true;
10398   if (VF.Width.isScalar()) {
10399     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10400     VecDiagMsg = std::make_pair(
10401         "VectorizationNotBeneficial",
10402         "the cost-model indicates that vectorization is not beneficial");
10403     VectorizeLoop = false;
10404   }
10405 
10406   if (!MaybeVF && UserIC > 1) {
10407     // Tell the user interleaving was avoided up-front, despite being explicitly
10408     // requested.
10409     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10410                          "interleaving should be avoided up front\n");
10411     IntDiagMsg = std::make_pair(
10412         "InterleavingAvoided",
10413         "Ignoring UserIC, because interleaving was avoided up front");
10414     InterleaveLoop = false;
10415   } else if (IC == 1 && UserIC <= 1) {
10416     // Tell the user interleaving is not beneficial.
10417     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10418     IntDiagMsg = std::make_pair(
10419         "InterleavingNotBeneficial",
10420         "the cost-model indicates that interleaving is not beneficial");
10421     InterleaveLoop = false;
10422     if (UserIC == 1) {
10423       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10424       IntDiagMsg.second +=
10425           " and is explicitly disabled or interleave count is set to 1";
10426     }
10427   } else if (IC > 1 && UserIC == 1) {
10428     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10429     LLVM_DEBUG(
10430         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10431     IntDiagMsg = std::make_pair(
10432         "InterleavingBeneficialButDisabled",
10433         "the cost-model indicates that interleaving is beneficial "
10434         "but is explicitly disabled or interleave count is set to 1");
10435     InterleaveLoop = false;
10436   }
10437 
10438   // Override IC if user provided an interleave count.
10439   IC = UserIC > 0 ? UserIC : IC;
10440 
10441   // Emit diagnostic messages, if any.
10442   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10443   if (!VectorizeLoop && !InterleaveLoop) {
10444     // Do not vectorize or interleave the loop.
10445     ORE->emit([&]() {
10446       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10447                                       L->getStartLoc(), L->getHeader())
10448              << VecDiagMsg.second;
10449     });
10450     ORE->emit([&]() {
10451       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10452                                       L->getStartLoc(), L->getHeader())
10453              << IntDiagMsg.second;
10454     });
10455     return false;
10456   } else if (!VectorizeLoop && InterleaveLoop) {
10457     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10458     ORE->emit([&]() {
10459       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10460                                         L->getStartLoc(), L->getHeader())
10461              << VecDiagMsg.second;
10462     });
10463   } else if (VectorizeLoop && !InterleaveLoop) {
10464     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10465                       << ") in " << DebugLocStr << '\n');
10466     ORE->emit([&]() {
10467       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10468                                         L->getStartLoc(), L->getHeader())
10469              << IntDiagMsg.second;
10470     });
10471   } else if (VectorizeLoop && InterleaveLoop) {
10472     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10473                       << ") in " << DebugLocStr << '\n');
10474     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10475   }
10476 
10477   bool DisableRuntimeUnroll = false;
10478   MDNode *OrigLoopID = L->getLoopID();
10479   {
10480     // Optimistically generate runtime checks. Drop them if they turn out to not
10481     // be profitable. Limit the scope of Checks, so the cleanup happens
10482     // immediately after vector code generation is done.
10483     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10484                              F->getParent()->getDataLayout());
10485     if (!VF.Width.isScalar() || IC > 1)
10486       Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
10487 
10488     using namespace ore;
10489     if (!VectorizeLoop) {
10490       assert(IC > 1 && "interleave count should not be 1 or 0");
10491       // If we decided that it is not legal to vectorize the loop, then
10492       // interleave it.
10493       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10494                                  &CM, BFI, PSI, Checks);
10495 
10496       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10497       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10498 
10499       ORE->emit([&]() {
10500         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10501                                   L->getHeader())
10502                << "interleaved loop (interleaved count: "
10503                << NV("InterleaveCount", IC) << ")";
10504       });
10505     } else {
10506       // If we decided that it is *legal* to vectorize the loop, then do it.
10507 
10508       // Consider vectorizing the epilogue too if it's profitable.
10509       VectorizationFactor EpilogueVF =
10510           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10511       if (EpilogueVF.Width.isVector()) {
10512 
10513         // The first pass vectorizes the main loop and creates a scalar epilogue
10514         // to be vectorized by executing the plan (potentially with a different
10515         // factor) again shortly afterwards.
10516         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10517         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10518                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10519 
10520         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10521         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10522                         DT);
10523         ++LoopsVectorized;
10524 
10525         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10526         formLCSSARecursively(*L, *DT, LI, SE);
10527 
10528         // Second pass vectorizes the epilogue and adjusts the control flow
10529         // edges from the first pass.
10530         EPI.MainLoopVF = EPI.EpilogueVF;
10531         EPI.MainLoopUF = EPI.EpilogueUF;
10532         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10533                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10534                                                  Checks);
10535 
10536         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10537         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10538                         DT);
10539         ++LoopsEpilogueVectorized;
10540 
10541         if (!MainILV.areSafetyChecksAdded())
10542           DisableRuntimeUnroll = true;
10543       } else {
10544         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10545                                &LVL, &CM, BFI, PSI, Checks);
10546 
10547         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10548         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
10549         ++LoopsVectorized;
10550 
10551         // Add metadata to disable runtime unrolling a scalar loop when there
10552         // are no runtime checks about strides and memory. A scalar loop that is
10553         // rarely used is not worth unrolling.
10554         if (!LB.areSafetyChecksAdded())
10555           DisableRuntimeUnroll = true;
10556       }
10557       // Report the vectorization decision.
10558       ORE->emit([&]() {
10559         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10560                                   L->getHeader())
10561                << "vectorized loop (vectorization width: "
10562                << NV("VectorizationFactor", VF.Width)
10563                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10564       });
10565     }
10566 
10567     if (ORE->allowExtraAnalysis(LV_NAME))
10568       checkMixedPrecision(L, ORE);
10569   }
10570 
10571   Optional<MDNode *> RemainderLoopID =
10572       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10573                                       LLVMLoopVectorizeFollowupEpilogue});
10574   if (RemainderLoopID.hasValue()) {
10575     L->setLoopID(RemainderLoopID.getValue());
10576   } else {
10577     if (DisableRuntimeUnroll)
10578       AddRuntimeUnrollDisableMetaData(L);
10579 
10580     // Mark the loop as already vectorized to avoid vectorizing again.
10581     Hints.setAlreadyVectorized();
10582   }
10583 
10584   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10585   return true;
10586 }
10587 
10588 LoopVectorizeResult LoopVectorizePass::runImpl(
10589     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10590     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10591     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10592     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10593     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10594   SE = &SE_;
10595   LI = &LI_;
10596   TTI = &TTI_;
10597   DT = &DT_;
10598   BFI = &BFI_;
10599   TLI = TLI_;
10600   AA = &AA_;
10601   AC = &AC_;
10602   GetLAA = &GetLAA_;
10603   DB = &DB_;
10604   ORE = &ORE_;
10605   PSI = PSI_;
10606 
10607   // Don't attempt if
10608   // 1. the target claims to have no vector registers, and
10609   // 2. interleaving won't help ILP.
10610   //
10611   // The second condition is necessary because, even if the target has no
10612   // vector registers, loop vectorization may still enable scalar
10613   // interleaving.
10614   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10615       TTI->getMaxInterleaveFactor(1) < 2)
10616     return LoopVectorizeResult(false, false);
10617 
10618   bool Changed = false, CFGChanged = false;
10619 
10620   // The vectorizer requires loops to be in simplified form.
10621   // Since simplification may add new inner loops, it has to run before the
10622   // legality and profitability checks. This means running the loop vectorizer
10623   // will simplify all loops, regardless of whether anything ends up being
10624   // vectorized.
10625   for (auto &L : *LI)
10626     Changed |= CFGChanged |=
10627         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10628 
10629   // Build up a worklist of inner-loops to vectorize. This is necessary as
10630   // the act of vectorizing or partially unrolling a loop creates new loops
10631   // and can invalidate iterators across the loops.
10632   SmallVector<Loop *, 8> Worklist;
10633 
10634   for (Loop *L : *LI)
10635     collectSupportedLoops(*L, LI, ORE, Worklist);
10636 
10637   LoopsAnalyzed += Worklist.size();
10638 
10639   // Now walk the identified inner loops.
10640   while (!Worklist.empty()) {
10641     Loop *L = Worklist.pop_back_val();
10642 
10643     // For the inner loops we actually process, form LCSSA to simplify the
10644     // transform.
10645     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10646 
10647     Changed |= CFGChanged |= processLoop(L);
10648   }
10649 
10650   // Process each loop nest in the function.
10651   return LoopVectorizeResult(Changed, CFGChanged);
10652 }
10653 
10654 PreservedAnalyses LoopVectorizePass::run(Function &F,
10655                                          FunctionAnalysisManager &AM) {
10656     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10657     auto &LI = AM.getResult<LoopAnalysis>(F);
10658     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10659     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10660     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10661     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10662     auto &AA = AM.getResult<AAManager>(F);
10663     auto &AC = AM.getResult<AssumptionAnalysis>(F);
10664     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10665     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10666 
10667     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
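    // Provide LoopAccessInfo lazily, per loop, by querying the inner loop
    // analysis manager with a set of standard analysis results.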
10668     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10669         [&](Loop &L) -> const LoopAccessInfo & {
10670       LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
10671                                         TLI, TTI, nullptr, nullptr, nullptr};
10672       return LAM.getResult<LoopAccessAnalysis>(L, AR);
10673     };
10674     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10675     ProfileSummaryInfo *PSI =
10676         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10677     LoopVectorizeResult Result =
10678         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10679     if (!Result.MadeAnyChange)
10680       return PreservedAnalyses::all();
10681     PreservedAnalyses PA;
10682 
10683     // We currently do not preserve loopinfo/dominator analyses with outer loop
10684     // vectorization. Until this is addressed, mark these analyses as preserved
10685     // only for non-VPlan-native path.
10686     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10687     if (!EnableVPlanNativePath) {
10688       PA.preserve<LoopAnalysis>();
10689       PA.preserve<DominatorTreeAnalysis>();
10690     }
10691 
10692     if (Result.MadeCFGChange) {
10693       // Making CFG changes likely means a loop got vectorized. Indicate that
10694       // extra simplification passes should be run.
10695       // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10696       // be run if runtime checks have been added.
10697       AM.getResult<ShouldRunExtraVectorPasses>(F);
10698       PA.preserve<ShouldRunExtraVectorPasses>();
10699     } else {
10700       PA.preserveSet<CFGAnalyses>();
10701     }
10702     return PA;
10703 }
10704 
10705 void LoopVectorizePass::printPipeline(
10706     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10707   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10708       OS, MapClassName2PassName);
10709 
10710   OS << "<";
10711   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10712   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10713   OS << ">";
10714 }
10715