1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/LLVMContext.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/InstructionCost.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142 #include "llvm/Transforms/Utils/SizeOpts.h"
143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144 #include <algorithm>
145 #include <cassert>
146 #include <cstdint>
147 #include <cstdlib>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
using namespace llvm;

// Short name of the pass; also used as the debug type for LLVM_DEBUG output.
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

// Extra debug type for more detailed (verbose) debug output; only meaningful
// in asserts builds where LLVM_DEBUG is active.
#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

// Pass statistics, reported with -stats.
STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
/// Master switch for generating a vectorized epilogue loop in addition to the
/// main vectorized loop.
static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

/// Testing knob: values greater than 1 force that VF on all applicable
/// epilogue loops.
static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

/// Minimum main-loop vectorization factor for which an epilogue loop is
/// considered worth vectorizing.
static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

/// Cap on runtime memory checks when vectorization was explicitly requested
/// with a vectorize(enable) pragma.
static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
    "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks with a "
             "vectorize(enable) pragma."));

// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                         "scalar-epilogue",
                         "Don't tail-predicate loops, create scalar epilogue"),
              clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                         "predicate-else-scalar-epilogue",
                         "prefer tail-folding, create scalar epilogue if tail "
                         "folding fails."),
              clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                         "predicate-dont-vectorize",
                         "prefers tail-folding, don't attempt vectorization if "
                         "tail-folding fails.")));

/// When picking the VF, also try to maximize bandwidth based on the smallest
/// type present in the loop.
static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in loop."));

/// Enable vectorization of interleaved memory accesses.
static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
251 
252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
253     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
254     cl::desc("We don't interleave loops with a estimated constant trip count "
255              "below this number"));
256 
/// Testing knobs: the following Force* options override values normally
/// supplied by TargetTransformInfo, mostly for consistent lit testing.
static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

/// Cost below which a loop counts as "small" for interleaving decisions.
static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

/// Use block frequency / PGO data to drive size-vs-speed heuristics.
static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

/// Count the induction variable only once when computing the interleave
/// count's register-pressure heuristic.
static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

/// Allow if-predication of stores during vectorization.
static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

/// Upper bound on the interleave count used for a scalar reduction inside a
/// nested loop.
static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

/// Prefer in-loop vector reductions over out-of-loop ones, regardless of the
/// target's stated preference.
static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the targets preference."));

/// Allow vectorizing loops containing in-order (strict) FP reductions.
static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorisation of loops with in-order (strict) "
             "FP reductions"));

/// Prefer predicating the reduction operation itself over selecting the result
/// after the loop.
static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

/// Externally visible switch (see also the file header) enabling the
/// VPlan-native vectorization path, including outer loop vectorization.
cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// FIXME: Remove this switch once we have divergence analysis. Currently we
// assume divergent non-backedge branches when this switch is true.
cl::opt<bool> EnableVPlanPredication(
    "enable-vplan-predication", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path predicator with "
             "support for outer loop vectorization."));
355 
// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

// Master switches (declared in the llvm namespace so other passes can read
// them) for loop interleaving and loop vectorization.
cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

/// Dump VPlans in DOT (graphviz) format rather than plain text.
cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));
377 
378 /// A helper function that returns true if the given type is irregular. The
379 /// type is irregular if its allocated size doesn't equal the store size of an
380 /// element of the corresponding vector type.
381 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
382   // Determine if an array of N elements of type Ty is "bitcast compatible"
383   // with a <N x Ty> vector.
384   // This is only true if there is no padding between the array elements.
385   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
386 }
387 
/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() {
  // 50% execution probability -> reciprocal of 2.
  constexpr unsigned ReciprocalProb = 2;
  return ReciprocalProb;
}
395 
396 /// A helper function that returns an integer or floating-point constant with
397 /// value C.
398 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
399   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
400                            : ConstantFP::get(Ty, C);
401 }
402 
403 /// Returns "best known" trip count for the specified loop \p L as defined by
404 /// the following procedure:
405 ///   1) Returns exact trip count if it is known.
406 ///   2) Returns expected trip count according to profile data if any.
407 ///   3) Returns upper bound estimate if it is known.
408 ///   4) Returns None if all of the above failed.
409 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
410   // Check if exact trip count is known.
411   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
412     return ExpectedTC;
413 
414   // Check if there is an expected trip count available from profile data.
415   if (LoopVectorizeWithBlockFrequency)
416     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
417       return EstimatedTC;
418 
419   // Check if upper bound estimate is known.
420   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
421     return ExpectedTC;
422 
423   return None;
424 }
425 
426 // Forward declare GeneratedRTChecks.
427 class GeneratedRTChecks;
428 
429 namespace llvm {
430 
431 /// InnerLoopVectorizer vectorizes loops which contain only one basic
432 /// block to a specified vectorization factor (VF).
433 /// This class performs the widening of scalars into vectors, or multiple
434 /// scalars. This class also implements the following features:
435 /// * It inserts an epilogue loop for handling loops that don't have iteration
436 ///   counts that are known to be a multiple of the vectorization factor.
437 /// * It handles the code generation for reduction variables.
438 /// * Scalarization (implementation using scalars) of un-vectorizable
439 ///   instructions.
440 /// InnerLoopVectorizer does not perform any vectorization-legality
441 /// checks, and relies on the caller to check for the different legality
442 /// aspects. The InnerLoopVectorizer relies on the
443 /// LoopVectorizationLegality class to provide information about the induction
444 /// and reduction variables that were found to a given vectorization factor.
445 class InnerLoopVectorizer {
446 public:
  /// Construct an InnerLoopVectorizer for \p OrigLoop with vectorization
  /// factor \p VecWidth and unroll (interleave) factor \p UnrollFactor.
  /// Legality (\p LVL) and cost-model (\p CM) results are supplied by the
  /// caller; this class performs no legality checks itself (see class
  /// comment). \p RTChecks holds the runtime checks generated for this loop.
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
  }
464 
465   virtual ~InnerLoopVectorizer() = default;
466 
467   /// Create a new empty loop that will contain vectorized instructions later
468   /// on, while the old loop will be used as the scalar remainder. Control flow
469   /// is generated around the vectorized (and scalar epilogue) loops consisting
470   /// of various checks and bypasses. Return the pre-header block of the new
471   /// loop.
472   /// In the case of epilogue vectorization, this function is overriden to
473   /// handle the more complex control flow around the loops.
474   virtual BasicBlock *createVectorizedLoopSkeleton();
475 
476   /// Widen a single instruction within the innermost loop.
477   void widenInstruction(Instruction &I, VPWidenRecipe *WidenRec,
478                         VPTransformState &State);
479 
480   /// Widen a single call instruction within the innermost loop.
481   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
482                             VPTransformState &State);
483 
484   /// Widen a single select instruction within the innermost loop.
485   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
486                               bool InvariantCond, VPTransformState &State);
487 
488   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
489   void fixVectorizedLoop(VPTransformState &State);
490 
491   // Return true if any runtime check is added.
492   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
493 
494   /// A type for vectorized values in the new loop. Each value from the
495   /// original loop, when vectorized, is represented by UF vector values in the
496   /// new unrolled loop, where UF is the unroll factor.
497   using VectorParts = SmallVector<Value *, 2>;
498 
499   /// Vectorize a single GetElementPtrInst based on information gathered and
500   /// decisions taken during planning.
501   void widenGEP(GetElementPtrInst *GEP, VPWidenGEPRecipe *WidenGEPRec,
502                 VPUser &Indices, unsigned UF, ElementCount VF,
503                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant,
504                 VPTransformState &State);
505 
506   /// Vectorize a single first-order recurrence or pointer induction PHINode in
507   /// a block. This method handles the induction variable canonicalization. It
508   /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
509   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
510                            VPTransformState &State);
511 
512   /// A helper function to scalarize a single Instruction in the innermost loop.
513   /// Generates a sequence of scalar instances for each lane between \p MinLane
514   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
515   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
516   /// Instr's operands.
517   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
518                             const VPIteration &Instance, bool IfPredicateInstr,
519                             VPTransformState &State);
520 
521   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
522   /// is provided, the integer induction variable will first be truncated to
523   /// the corresponding type.
524   void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
525                              VPValue *Def, VPValue *CastDef,
526                              VPTransformState &State);
527 
528   /// Construct the vector value of a scalarized value \p V one lane at a time.
529   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
530                                  VPTransformState &State);
531 
532   /// Try to vectorize interleaved access group \p Group with the base address
533   /// given in \p Addr, optionally masking the vector operations if \p
534   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
535   /// values in the vectorized loop.
536   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
537                                 ArrayRef<VPValue *> VPDefs,
538                                 VPTransformState &State, VPValue *Addr,
539                                 ArrayRef<VPValue *> StoredValues,
540                                 VPValue *BlockInMask = nullptr);
541 
542   /// Vectorize Load and Store instructions with the base address given in \p
543   /// Addr, optionally masking the vector operations if \p BlockInMask is
544   /// non-null. Use \p State to translate given VPValues to IR values in the
545   /// vectorized loop.
546   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
547                                   VPValue *Def, VPValue *Addr,
548                                   VPValue *StoredValue, VPValue *BlockInMask,
549                                   bool ConsecutiveStride, bool Reverse);
550 
551   /// Set the debug location in the builder \p Ptr using the debug location in
552   /// \p V. If \p Ptr is None then it uses the class member's Builder.
553   void setDebugLocFromInst(const Value *V,
554                            Optional<IRBuilder<> *> CustomBuilder = None);
555 
556   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
557   void fixNonInductionPHIs(VPTransformState &State);
558 
559   /// Returns true if the reordering of FP operations is not allowed, but we are
560   /// able to vectorize with strict in-order reductions for the given RdxDesc.
561   bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);
562 
563   /// Create a broadcast instruction. This method generates a broadcast
564   /// instruction (shuffle) for loop invariant values and for the induction
565   /// value. If this is the induction variable then we extend it to N, N+1, ...
566   /// this is needed because each iteration in the loop corresponds to a SIMD
567   /// element.
568   virtual Value *getBroadcastInstrs(Value *V);
569 
570 protected:
571   friend class LoopVectorizationPlanner;
572 
573   /// A small list of PHINodes.
574   using PhiVector = SmallVector<PHINode *, 4>;
575 
576   /// A type for scalarized values in the new loop. Each value from the
577   /// original loop, when scalarized, is represented by UF x VF scalar values
578   /// in the new unrolled loop, where UF is the unroll factor and VF is the
579   /// vectorization factor.
580   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
581 
582   /// Set up the values of the IVs correctly when exiting the vector loop.
583   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
584                     Value *CountRoundDown, Value *EndValue,
585                     BasicBlock *MiddleBlock);
586 
587   /// Create a new induction variable inside L.
588   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
589                                    Value *Step, Instruction *DL);
590 
591   /// Handle all cross-iteration phis in the header.
592   void fixCrossIterationPHIs(VPTransformState &State);
593 
594   /// Create the exit value of first order recurrences in the middle block and
595   /// update their users.
596   void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
597 
598   /// Create code for the loop exit value of the reduction.
599   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
600 
601   /// Clear NSW/NUW flags from reduction instructions if necessary.
602   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
603                                VPTransformState &State);
604 
605   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
606   /// means we need to add the appropriate incoming value from the middle
607   /// block as exiting edges from the scalar epilogue loop (if present) are
608   /// already in place, and we exit the vector loop exclusively to the middle
609   /// block.
610   void fixLCSSAPHIs(VPTransformState &State);
611 
612   /// Iteratively sink the scalarized operands of a predicated instruction into
613   /// the block that was created for it.
614   void sinkScalarOperands(Instruction *PredInst);
615 
616   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
617   /// represented as.
618   void truncateToMinimalBitwidths(VPTransformState &State);
619 
620   /// This function adds
621   /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
622   /// to each vector element of Val. The sequence starts at StartIndex.
623   /// \p Opcode is relevant for FP induction variable.
624   virtual Value *
625   getStepVector(Value *Val, Value *StartIdx, Value *Step,
626                 Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd);
627 
628   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
629   /// variable on which to base the steps, \p Step is the size of the step, and
630   /// \p EntryVal is the value from the original loop that maps to the steps.
631   /// Note that \p EntryVal doesn't have to be an induction variable - it
632   /// can also be a truncate instruction.
633   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
634                         const InductionDescriptor &ID, VPValue *Def,
635                         VPValue *CastDef, VPTransformState &State);
636 
637   /// Create a vector induction phi node based on an existing scalar one. \p
638   /// EntryVal is the value from the original loop that maps to the vector phi
639   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
640   /// truncate instruction, instead of widening the original IV, we widen a
641   /// version of the IV truncated to \p EntryVal's type.
642   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
643                                        Value *Step, Value *Start,
644                                        Instruction *EntryVal, VPValue *Def,
645                                        VPValue *CastDef,
646                                        VPTransformState &State);
647 
648   /// Returns true if an instruction \p I should be scalarized instead of
649   /// vectorized for the chosen vectorization factor.
650   bool shouldScalarizeInstruction(Instruction *I) const;
651 
652   /// Returns true if we should generate a scalar version of \p IV.
653   bool needsScalarInduction(Instruction *IV) const;
654 
655   /// If there is a cast involved in the induction variable \p ID, which should
656   /// be ignored in the vectorized loop body, this function records the
657   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
658   /// cast. We had already proved that the casted Phi is equal to the uncasted
659   /// Phi in the vectorized loop (under a runtime guard), and therefore
660   /// there is no need to vectorize the cast - the same value can be used in the
661   /// vector loop for both the Phi and the cast.
662   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified,
663   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
664   ///
665   /// \p EntryVal is the value from the original loop that maps to the vector
666   /// phi node and is used to distinguish what is the IV currently being
667   /// processed - original one (if \p EntryVal is a phi corresponding to the
668   /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
670   /// latter case \p EntryVal is a TruncInst and we must not record anything for
671   /// that IV, but it's error-prone to expect callers of this routine to care
672   /// about that, hence this explicit parameter.
673   void recordVectorLoopValueForInductionCast(
674       const InductionDescriptor &ID, const Instruction *EntryVal,
675       Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
676       unsigned Part, unsigned Lane = UINT_MAX);
677 
678   /// Generate a shuffle sequence that will reverse the vector Vec.
679   virtual Value *reverseVector(Value *Vec);
680 
681   /// Returns (and creates if needed) the original loop trip count.
682   Value *getOrCreateTripCount(Loop *NewLoop);
683 
684   /// Returns (and creates if needed) the trip count of the widened loop.
685   Value *getOrCreateVectorTripCount(Loop *NewLoop);
686 
687   /// Returns a bitcasted value to the requested vector type.
688   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
689   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
690                                 const DataLayout &DL);
691 
692   /// Emit a bypass check to see if the vector trip count is zero, including if
693   /// it overflows.
694   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
695 
696   /// Emit a bypass check to see if all of the SCEV assumptions we've
697   /// had to make are correct. Returns the block containing the checks or
698   /// nullptr if no checks have been added.
699   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
700 
701   /// Emit bypass checks to check any memory assumptions we may have made.
702   /// Returns the block containing the checks or nullptr if no checks have been
703   /// added.
704   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
705 
706   /// Compute the transformed value of Index at offset StartValue using step
707   /// StepValue.
708   /// For integer induction, returns StartValue + Index * StepValue.
709   /// For pointer induction, returns StartValue[Index * StepValue].
710   /// FIXME: The newly created binary instructions should contain nsw/nuw
711   /// flags, which can be found from the original scalar operations.
712   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
713                               const DataLayout &DL,
714                               const InductionDescriptor &ID) const;
715 
716   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
717   /// vector loop preheader, middle block and scalar preheader. Also
718   /// allocate a loop object for the new vector loop and return it.
719   Loop *createVectorLoopSkeleton(StringRef Prefix);
720 
721   /// Create new phi nodes for the induction variables to resume iteration count
722   /// in the scalar epilogue, from where the vectorized loop left off (given by
723   /// \p VectorTripCount).
724   /// In cases where the loop skeleton is more complicated (eg. epilogue
725   /// vectorization) and the resume values can come from an additional bypass
726   /// block, the \p AdditionalBypass pair provides information about the bypass
727   /// block and the end value on the edge from bypass to this loop.
728   void createInductionResumeValues(
729       Loop *L, Value *VectorTripCount,
730       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
731 
732   /// Complete the loop skeleton by adding debug MDs, creating appropriate
733   /// conditional branches in the middle block, preparing the builder and
734   /// running the verifier. Take in the vector loop \p L as argument, and return
735   /// the preheader of the completed vector loop.
736   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
737 
738   /// Add additional metadata to \p To that was not present on \p Orig.
739   ///
740   /// Currently this is used to add the noalias annotations based on the
741   /// inserted memchecks.  Use this for instructions that are *cloned* into the
742   /// vector loop.
743   void addNewMetadata(Instruction *To, const Instruction *Orig);
744 
745   /// Add metadata from one instruction to another.
746   ///
747   /// This includes both the original MDs from \p From and additional ones (\see
748   /// addNewMetadata).  Use this for *newly created* instructions in the vector
749   /// loop.
750   void addMetadata(Instruction *To, Instruction *From);
751 
752   /// Similar to the previous function but it adds the metadata to a
753   /// vector of instructions.
754   void addMetadata(ArrayRef<Value *> To, Instruction *From);
755 
756   /// Collect poison-generating recipes that may generate a poison value that is
757   /// used after vectorization, even when their operands are not poison. Those
758   /// recipes meet the following conditions:
759   ///  * Contribute to the address computation of a recipe generating a widen
760   ///    memory load/store (VPWidenMemoryInstructionRecipe or
761   ///    VPInterleaveRecipe).
762   ///  * Such a widen memory load/store has at least one underlying Instruction
763   ///    that is in a basic block that needs predication and after vectorization
764   ///    the generated instruction won't be predicated.
765   void collectPoisonGeneratingRecipes(VPTransformState &State);
766 
767   /// Allow subclasses to override and print debug traces before/after vplan
768   /// execution, when trace information is requested.
769   virtual void printDebugTracesAtStart(){};
770   virtual void printDebugTracesAtEnd(){};
771 
772   /// The original loop.
773   Loop *OrigLoop;
774 
775   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
776   /// dynamic knowledge to simplify SCEV expressions and converts them to a
777   /// more usable form.
778   PredicatedScalarEvolution &PSE;
779 
780   /// Loop Info.
781   LoopInfo *LI;
782 
783   /// Dominator Tree.
784   DominatorTree *DT;
785 
786   /// Alias Analysis.
787   AAResults *AA;
788 
789   /// Target Library Info.
790   const TargetLibraryInfo *TLI;
791 
792   /// Target Transform Info.
793   const TargetTransformInfo *TTI;
794 
795   /// Assumption Cache.
796   AssumptionCache *AC;
797 
798   /// Interface to emit optimization remarks.
799   OptimizationRemarkEmitter *ORE;
800 
801   /// LoopVersioning.  It's only set up (non-null) if memchecks were
802   /// used.
803   ///
  /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
806   std::unique_ptr<LoopVersioning> LVer;
807 
808   /// The vectorization SIMD factor to use. Each vector will have this many
809   /// vector elements.
810   ElementCount VF;
811 
812   /// The vectorization unroll factor to use. Each scalar is vectorized to this
813   /// many different vector instructions.
814   unsigned UF;
815 
816   /// The builder that we use
817   IRBuilder<> Builder;
818 
819   // --- Vectorization state ---
820 
821   /// The vector-loop preheader.
822   BasicBlock *LoopVectorPreHeader;
823 
824   /// The scalar-loop preheader.
825   BasicBlock *LoopScalarPreHeader;
826 
827   /// Middle Block between the vector and the scalar.
828   BasicBlock *LoopMiddleBlock;
829 
830   /// The unique ExitBlock of the scalar loop if one exists.  Note that
831   /// there can be multiple exiting edges reaching this block.
832   BasicBlock *LoopExitBlock;
833 
834   /// The vector loop body.
835   BasicBlock *LoopVectorBody;
836 
837   /// The scalar loop body.
838   BasicBlock *LoopScalarBody;
839 
840   /// A list of all bypass blocks. The first block is the entry of the loop.
841   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
842 
843   /// The new Induction variable which was added to the new block.
844   PHINode *Induction = nullptr;
845 
846   /// The induction variable of the old basic block.
847   PHINode *OldInduction = nullptr;
848 
849   /// Store instructions that were predicated.
850   SmallVector<Instruction *, 4> PredicatedInstructions;
851 
852   /// Trip count of the original loop.
853   Value *TripCount = nullptr;
854 
855   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
856   Value *VectorTripCount = nullptr;
857 
858   /// The legality analysis.
859   LoopVectorizationLegality *Legal;
860 
  /// The profitability analysis.
862   LoopVectorizationCostModel *Cost;
863 
864   // Record whether runtime checks are added.
865   bool AddedSafetyChecks = false;
866 
867   // Holds the end values for each induction variable. We save the end values
868   // so we can later fix-up the external users of the induction variables.
869   DenseMap<PHINode *, Value *> IVEndValues;
870 
871   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
872   // fixed up at the end of vector code generation.
873   SmallVector<PHINode *, 8> OrigPHIsToFix;
874 
875   /// BFI and PSI are used to check for profile guided size optimizations.
876   BlockFrequencyInfo *BFI;
877   ProfileSummaryInfo *PSI;
878 
  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
881   bool OptForSizeBasedOnProfile;
882 
883   /// Structure to hold information about generated runtime checks, responsible
884   /// for cleaning the checks, if vectorization turns out unprofitable.
885   GeneratedRTChecks &RTChecks;
886 };
887 
/// A specialization of the inner loop vectorizer that only unrolls
/// (interleaves) the loop: it fixes the vectorization factor to a single
/// scalar lane (ElementCount::getFixed(1)) so that only \p UnrollFactor
/// drives the transformation.
class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  // Value-producing hooks of the base class, re-implemented for the
  // scalar (VF = 1) case.
  Value *getBroadcastInstrs(Value *V) override;
  Value *getStepVector(
      Value *Val, Value *StartIdx, Value *Step,
      Instruction::BinaryOps Opcode = Instruction::BinaryOpsEnd) override;
  Value *reverseVector(Value *Vec) override;
};
909 
/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  /// VF and UF to use when vectorizing the main loop.
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  /// VF and UF to use when vectorizing the epilogue loop. EpilogueUF is
  /// required to be 1 (asserted in the constructor).
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  /// Skeleton blocks shared between the two vectorization passes; null until
  /// created (presumably while building the combined skeleton — see the
  /// EpilogueVectorizer* classes below).
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  /// Cached trip counts, shared between the two passes.
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};
932 
933 /// An extension of the inner loop vectorizer that creates a skeleton for a
934 /// vectorized loop that has its epilogue (residual) also vectorized.
935 /// The idea is to run the vplan on a given loop twice, firstly to setup the
936 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
937 /// from the first step and vectorize the epilogue.  This is achieved by
938 /// deriving two concrete strategy classes from this base class and invoking
939 /// them in succession from the loop vectorizer planner.
940 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
941 public:
942   InnerLoopAndEpilogueVectorizer(
943       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
944       DominatorTree *DT, const TargetLibraryInfo *TLI,
945       const TargetTransformInfo *TTI, AssumptionCache *AC,
946       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
947       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
948       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
949       GeneratedRTChecks &Checks)
950       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
951                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
952                             Checks),
953         EPI(EPI) {}
954 
955   // Override this function to handle the more complex control flow around the
956   // three loops.
957   BasicBlock *createVectorizedLoopSkeleton() final override {
958     return createEpilogueVectorizedLoopSkeleton();
959   }
960 
961   /// The interface for creating a vectorized skeleton using one of two
962   /// different strategies, each corresponding to one execution of the vplan
963   /// as described above.
964   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
965 
966   /// Holds and updates state information required to vectorize the main loop
967   /// and its epilogue in two separate passes. This setup helps us avoid
968   /// regenerating and recomputing runtime safety checks. It also helps us to
969   /// shorten the iteration-count-check path length for the cases where the
970   /// iteration count of the loop is so small that the main vector loop is
971   /// completely skipped.
972   EpilogueLoopVectorizationInfo &EPI;
973 };
974 
975 /// A specialized derived class of inner loop vectorizer that performs
976 /// vectorization of *main* loops in the process of vectorizing loops and their
977 /// epilogues.
978 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
979 public:
980   EpilogueVectorizerMainLoop(
981       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
982       DominatorTree *DT, const TargetLibraryInfo *TLI,
983       const TargetTransformInfo *TTI, AssumptionCache *AC,
984       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
985       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
986       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
987       GeneratedRTChecks &Check)
988       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
989                                        EPI, LVL, CM, BFI, PSI, Check) {}
990   /// Implements the interface for creating a vectorized skeleton using the
991   /// *main loop* strategy (ie the first pass of vplan execution).
992   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
993 
994 protected:
995   /// Emits an iteration count bypass check once for the main loop (when \p
996   /// ForEpilogue is false) and once for the epilogue loop (when \p
997   /// ForEpilogue is true).
998   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
999                                              bool ForEpilogue);
1000   void printDebugTracesAtStart() override;
1001   void printDebugTracesAtEnd() override;
1002 };
1003 
1004 // A specialized derived class of inner loop vectorizer that performs
1005 // vectorization of *epilogue* loops in the process of vectorizing loops and
1006 // their epilogues.
1007 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
1008 public:
1009   EpilogueVectorizerEpilogueLoop(
1010       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
1011       DominatorTree *DT, const TargetLibraryInfo *TLI,
1012       const TargetTransformInfo *TTI, AssumptionCache *AC,
1013       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
1014       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
1015       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
1016       GeneratedRTChecks &Checks)
1017       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1018                                        EPI, LVL, CM, BFI, PSI, Checks) {}
1019   /// Implements the interface for creating a vectorized skeleton using the
1020   /// *epilogue loop* strategy (ie the second pass of vplan execution).
1021   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1022 
1023 protected:
1024   /// Emits an iteration count bypass check after the main vector loop has
1025   /// finished to see if there are any iterations left to execute by either
1026   /// the vector epilogue or the scalar epilogue.
1027   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1028                                                       BasicBlock *Bypass,
1029                                                       BasicBlock *Insert);
1030   void printDebugTracesAtStart() override;
1031   void printDebugTracesAtEnd() override;
1032 };
1033 } // end namespace llvm
1034 
1035 /// Look for a meaningful debug location on the instruction or it's
1036 /// operands.
1037 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1038   if (!I)
1039     return I;
1040 
1041   DebugLoc Empty;
1042   if (I->getDebugLoc() != Empty)
1043     return I;
1044 
1045   for (Use &Op : I->operands()) {
1046     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1047       if (OpInst->getDebugLoc() != Empty)
1048         return OpInst;
1049   }
1050 
1051   return I;
1052 }
1053 
/// Set the debug location of \p CustomBuilder (or the member Builder when
/// none is given) from \p V, scaling the location's discriminator
/// duplication factor by UF * VF where applicable.
void InnerLoopVectorizer::setDebugLocFromInst(
    const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
  IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
    const DILocation *DIL = Inst->getDebugLoc();

    // When a FSDiscriminator is enabled, we don't need to add the multiply
    // factors to the discriminators.
    if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
        !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
      // FIXME: For scalable vectors, assume vscale=1.
      auto NewDIL =
          DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
      if (NewDIL)
        B->SetCurrentDebugLocation(NewDIL.getValue());
      else
        // Cloning may fail; in that case the builder's current location is
        // left untouched and the failure is only reported under LLVM_DEBUG.
        LLVM_DEBUG(dbgs()
                   << "Failed to create new discriminator: "
                   << DIL->getFilename() << " Line: " << DIL->getLine());
    } else
      B->SetCurrentDebugLocation(DIL);
  } else
    // \p V is null or not an instruction: clear the builder's location.
    B->SetCurrentDebugLocation(DebugLoc());
}
1078 
1079 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1080 /// is passed, the message relates to that particular instruction.
1081 #ifndef NDEBUG
1082 static void debugVectorizationMessage(const StringRef Prefix,
1083                                       const StringRef DebugMsg,
1084                                       Instruction *I) {
1085   dbgs() << "LV: " << Prefix << DebugMsg;
1086   if (I != nullptr)
1087     dbgs() << " " << *I;
1088   else
1089     dbgs() << '.';
1090   dbgs() << '\n';
1091 }
1092 #endif
1093 
1094 /// Create an analysis remark that explains why vectorization failed
1095 ///
1096 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1097 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1098 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1099 /// the location of the remark.  \return the remark object that can be
1100 /// streamed to.
1101 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1102     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1103   Value *CodeRegion = TheLoop->getHeader();
1104   DebugLoc DL = TheLoop->getStartLoc();
1105 
1106   if (I) {
1107     CodeRegion = I->getParent();
1108     // If there is no debug location attached to the instruction, revert back to
1109     // using the loop's.
1110     if (I->getDebugLoc())
1111       DL = I->getDebugLoc();
1112   }
1113 
1114   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1115 }
1116 
1117 /// Return a value for Step multiplied by VF.
1118 static Value *createStepForVF(IRBuilder<> &B, Type *Ty, ElementCount VF,
1119                               int64_t Step) {
1120   assert(Ty->isIntegerTy() && "Expected an integer step");
1121   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
1122   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1123 }
1124 
1125 namespace llvm {
1126 
1127 /// Return the runtime value for VF.
1128 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1129   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1130   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1131 }
1132 
1133 static Value *getRuntimeVFAsFloat(IRBuilder<> &B, Type *FTy, ElementCount VF) {
1134   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1135   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1136   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1137   return B.CreateUIToFP(RuntimeVF, FTy);
1138 }
1139 
1140 void reportVectorizationFailure(const StringRef DebugMsg,
1141                                 const StringRef OREMsg, const StringRef ORETag,
1142                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1143                                 Instruction *I) {
1144   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1145   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1146   ORE->emit(
1147       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1148       << "loop not vectorized: " << OREMsg);
1149 }
1150 
1151 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1152                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1153                              Instruction *I) {
1154   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1155   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1156   ORE->emit(
1157       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1158       << Msg);
1159 }
1160 
1161 } // end namespace llvm
1162 
1163 #ifndef NDEBUG
1164 /// \return string containing a file name and a line # for the given loop.
1165 static std::string getDebugLocString(const Loop *L) {
1166   std::string Result;
1167   if (L) {
1168     raw_string_ostream OS(Result);
1169     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1170       LoopDbgLoc.print(OS);
1171     else
1172       // Just print the module name.
1173       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1174     OS.flush();
1175   }
1176   return Result;
1177 }
1178 #endif
1179 
1180 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1181                                          const Instruction *Orig) {
1182   // If the loop was versioned with memchecks, add the corresponding no-alias
1183   // metadata.
1184   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1185     LVer->annotateInstWithNoAlias(To, Orig);
1186 }
1187 
void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  // Note: `Visited` is shared across invocations of the lambda below, so a
  // recipe reachable from several roots is processed at most once.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && cast<Operator>(Instr)->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a VPWidenRecipe or
  // VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        // Only consecutive widen loads/stores whose underlying instruction's
        // block needs predication start a backward slice.
        if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
            Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            // Members may be null (gaps in the group); skip those.
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}
1265 
1266 void InnerLoopVectorizer::addMetadata(Instruction *To,
1267                                       Instruction *From) {
1268   propagateMetadata(To, From);
1269   addNewMetadata(To, From);
1270 }
1271 
1272 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1273                                       Instruction *From) {
1274   for (Value *V : To) {
1275     if (Instruction *I = dyn_cast<Instruction>(V))
1276       addMetadata(I, From);
1277   }
1278 }
1279 
1280 namespace llvm {
1281 
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};
1304 
1305 /// ElementCountComparator creates a total ordering for ElementCount
1306 /// for the purposes of using it in a set structure.
1307 struct ElementCountComparator {
1308   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1309     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1310            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1311   }
1312 };
1313 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1314 
/// LoopVectorizationCostModel - estimates the expected speedups due to
/// vectorization.
/// In many cases vectorization is not profitable. This can happen because of
/// a number of reasons. In this class we mainly attempt to predict the
/// expected speedup/slowdowns due to the supported instruction set. We use the
/// TargetTransformInfo to query the different backends for the cost of
/// different operations.
class LoopVectorizationCostModel {
public:
  /// All analyses are captured by reference/pointer; the cost model owns no
  /// analysis state beyond what it derives lazily (per-VF maps below).
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {}

  /// \return An upper bound for the vectorization factors (both fixed and
  /// scalable). If the factors are 0, vectorization and interleaving should be
  /// avoided up front.
  FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);

  /// \return True if runtime checks are required for vectorization, and false
  /// otherwise.
  bool runtimeChecksRequired();

  /// \return The most profitable vectorization factor and the cost of that VF.
  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
  /// then this vectorization factor will be selected if vectorization is
  /// possible.
  VectorizationFactor
  selectVectorizationFactor(const ElementCountSet &CandidateVFs);

  /// \return The vectorization factor for the epilogue loop, given the main
  /// loop's chosen factor \p MaxVF and the planner \p LVP.
  VectorizationFactor
  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
                                    const LoopVectorizationPlanner &LVP);
1357   /// Setup cost-based decisions for user vectorization factor.
1358   /// \return true if the UserVF is a feasible VF to be chosen.
1359   bool selectUserVectorizationFactor(ElementCount UserVF) {
1360     collectUniformsAndScalars(UserVF);
1361     collectInstsToScalarize(UserVF);
1362     return expectedCost(UserVF).first.isValid();
1363   }
1364 
  /// \return The size (in bits) of the smallest and widest types in the code
  /// that needs to be vectorized. We ignore values that remain scalar such as
  /// 64 bit loop indices.
  std::pair<unsigned, unsigned> getSmallestAndWidestTypes();

  /// \return The desired interleave count.
  /// If interleave count has been specified by metadata it will be returned.
  /// Otherwise, the interleave count is computed and returned. VF and LoopCost
  /// are the selected vectorization factor and the cost of the selected VF.
  unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);

  /// Memory access instruction may be vectorized in more than one way.
  /// Form of instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decisions map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with widening decision in order to
  /// avoid redundant calculations.
  void setCostBasedWideningDecision(ElementCount VF);

  /// A struct that represents some properties of the register usage
  /// of a loop.
  struct RegisterUsage {
    /// Holds the number of loop invariant values that are used in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
    /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is ClassID of target-provided register class.
    SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
  };

  /// \return Returns information about the register usages of the loop for the
  /// given vectorization factors.
  SmallVector<RegisterUsage, 8>
  calculateRegisterUsage(ArrayRef<ElementCount> VFs);

  /// Collect values we want to ignore in the cost model.
  void collectValuesToIgnore();

  /// Collect all element types in the loop for which widening is needed.
  void collectElementTypesForWidening();

  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In loop reductions are collected into InLoopReductionChains.
  void collectInLoopReductions();
1411   /// Returns true if we should use strict in-order reductions for the given
1412   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1413   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1414   /// of FP operations.
1415   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1416     return !Hints->allowReordering() && RdxDesc.isOrdered();
1417   }
1418 
  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type. (Read-only view of the MinBWs map computed by the cost model.)
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }
1425 
1426   /// \returns True if it is more profitable to scalarize instruction \p I for
1427   /// vectorization factor \p VF.
1428   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1429     assert(VF.isVector() &&
1430            "Profitable to scalarize relevant only for VF > 1.");
1431 
1432     // Cost model is not run in the VPlan-native path - return conservative
1433     // result until this changes.
1434     if (EnableVPlanNativePath)
1435       return false;
1436 
1437     auto Scalars = InstsToScalarize.find(VF);
1438     assert(Scalars != InstsToScalarize.end() &&
1439            "VF not yet analyzed for scalarization profitability");
1440     return Scalars->second.find(I) != Scalars->second.end();
1441   }
1442 
1443   /// Returns true if \p I is known to be uniform after vectorization.
1444   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1445     if (VF.isScalar())
1446       return true;
1447 
1448     // Cost model is not run in the VPlan-native path - return conservative
1449     // result until this changes.
1450     if (EnableVPlanNativePath)
1451       return false;
1452 
1453     auto UniformsPerVF = Uniforms.find(VF);
1454     assert(UniformsPerVF != Uniforms.end() &&
1455            "VF not yet analyzed for uniformity");
1456     return UniformsPerVF->second.count(I);
1457   }
1458 
1459   /// Returns true if \p I is known to be scalar after vectorization.
1460   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1461     if (VF.isScalar())
1462       return true;
1463 
1464     // Cost model is not run in the VPlan-native path - return conservative
1465     // result until this changes.
1466     if (EnableVPlanNativePath)
1467       return false;
1468 
1469     auto ScalarsPerVF = Scalars.find(VF);
1470     assert(ScalarsPerVF != Scalars.end() &&
1471            "Scalar values are not calculated for VF");
1472     return ScalarsPerVF->second.count(I);
1473   }
1474 
1475   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1476   /// for vectorization factor \p VF.
1477   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1478     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1479            !isProfitableToScalarize(I, VF) &&
1480            !isScalarAfterVectorization(I, VF);
1481   }
1482 
  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,       // No decision recorded for this (instruction, VF) pair.
    CM_Widen,         // For consecutive accesses with stride +1.
    CM_Widen_Reverse, // For consecutive accesses with stride -1.
    CM_Interleave,    // Member of an interleaved-access group.
    CM_GatherScatter, // Lowered as a masked gather/scatter.
    CM_Scalarize      // Replicated as scalar accesses.
  };
1492 
1493   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1494   /// instruction \p I and vector width \p VF.
1495   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1496                            InstructionCost Cost) {
1497     assert(VF.isVector() && "Expected VF >=2");
1498     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1499   }
1500 
1501   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1502   /// interleaving group \p Grp and vector width \p VF.
1503   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1504                            ElementCount VF, InstWidening W,
1505                            InstructionCost Cost) {
1506     assert(VF.isVector() && "Expected VF >=2");
1507     /// Broadcast this decicion to all instructions inside the group.
1508     /// But the cost will be assigned to one instruction only.
1509     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1510       if (auto *I = Grp->getMember(i)) {
1511         if (Grp->getInsertPos() == I)
1512           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1513         else
1514           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1515       }
1516     }
1517   }
1518 
1519   /// Return the cost model decision for the given instruction \p I and vector
1520   /// width \p VF. Return CM_Unknown if this instruction did not pass
1521   /// through the cost modeling.
1522   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1523     assert(VF.isVector() && "Expected VF to be a vector VF");
1524     // Cost model is not run in the VPlan-native path - return conservative
1525     // result until this changes.
1526     if (EnableVPlanNativePath)
1527       return CM_GatherScatter;
1528 
1529     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1530     auto Itr = WideningDecisions.find(InstOnVF);
1531     if (Itr == WideningDecisions.end())
1532       return CM_Unknown;
1533     return Itr->second.first;
1534   }
1535 
1536   /// Return the vectorization cost for the given instruction \p I and vector
1537   /// width \p VF.
1538   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1539     assert(VF.isVector() && "Expected VF >=2");
1540     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1541     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1542            "The cost is not calculated");
1543     return WideningDecisions[InstOnVF].second;
1544   }
1545 
1546   /// Return True if instruction \p I is an optimizable truncate whose operand
1547   /// is an induction variable. Such a truncate will be removed by adding a new
1548   /// induction variable with the destination type.
1549   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1550     // If the instruction is not a truncate, return false.
1551     auto *Trunc = dyn_cast<TruncInst>(I);
1552     if (!Trunc)
1553       return false;
1554 
1555     // Get the source and destination types of the truncate.
1556     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1557     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1558 
1559     // If the truncate is free for the given types, return false. Replacing a
1560     // free truncate with an induction variable would add an induction variable
1561     // update instruction to each iteration of the loop. We exclude from this
1562     // check the primary induction variable since it will need an update
1563     // instruction regardless.
1564     Value *Op = Trunc->getOperand(0);
1565     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1566       return false;
1567 
1568     // If the truncated value is not an induction variable, return false.
1569     return Legal->isInductionPhi(Op);
1570   }
1571 
  /// Collects the instructions to scalarize for each predicated instruction in
  /// the loop. Results are recorded per-VF in the InstsToScalarize map.
  void collectInstsToScalarize(ElementCount VF);
1575 
1576   /// Collect Uniform and Scalar values for the given \p VF.
1577   /// The sets depend on CM decision for Load/Store instructions
1578   /// that may be vectorized as interleave, gather-scatter or scalarized.
1579   void collectUniformsAndScalars(ElementCount VF) {
1580     // Do the analysis once.
1581     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1582       return;
1583     setCostBasedWideningDecision(VF);
1584     collectLoopUniforms(VF);
1585     collectLoopScalars(VF);
1586   }
1587 
1588   /// Returns true if the target machine supports masked store operation
1589   /// for the given \p DataType and kind of access to \p Ptr.
1590   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1591     return Legal->isConsecutivePtr(DataType, Ptr) &&
1592            TTI.isLegalMaskedStore(DataType, Alignment);
1593   }
1594 
1595   /// Returns true if the target machine supports masked load operation
1596   /// for the given \p DataType and kind of access to \p Ptr.
1597   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1598     return Legal->isConsecutivePtr(DataType, Ptr) &&
1599            TTI.isLegalMaskedLoad(DataType, Alignment);
1600   }
1601 
1602   /// Returns true if the target machine can represent \p V as a masked gather
1603   /// or scatter operation.
1604   bool isLegalGatherOrScatter(Value *V) {
1605     bool LI = isa<LoadInst>(V);
1606     bool SI = isa<StoreInst>(V);
1607     if (!LI && !SI)
1608       return false;
1609     auto *Ty = getLoadStoreType(V);
1610     Align Align = getLoadStoreAlignment(V);
1611     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1612            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1613   }
1614 
1615   /// Returns true if the target machine supports all of the reduction
1616   /// variables found for the given VF.
1617   bool canVectorizeReductions(ElementCount VF) const {
1618     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1619       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1620       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1621     }));
1622   }
1623 
  /// Returns true if \p I is an instruction that will be scalarized with
  /// predication. Such instructions include conditional stores and
  /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
  bool isScalarWithPredication(Instruction *I) const;
1630 
1631   // Returns true if \p I is an instruction that will be predicated either
1632   // through scalar predication or masked load/store or masked gather/scatter.
1633   // Superset of instructions that return true for isScalarWithPredication.
1634   bool isPredicatedInst(Instruction *I, bool IsKnownUniform = false) {
1635     // When we know the load is uniform and the original scalar loop was not
1636     // predicated we don't need to mark it as a predicated instruction. Any
1637     // vectorised blocks created when tail-folding are something artificial we
1638     // have introduced and we know there is always at least one active lane.
1639     // That's why we call Legal->blockNeedsPredication here because it doesn't
1640     // query tail-folding.
1641     if (IsKnownUniform && isa<LoadInst>(I) &&
1642         !Legal->blockNeedsPredication(I->getParent()))
1643       return false;
1644     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1645       return false;
1646     // Loads and stores that need some form of masked operation are predicated
1647     // instructions.
1648     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1649       return Legal->isMaskRequired(I);
1650     return isScalarWithPredication(I);
1651   }
1652 
  /// Returns true if \p I is a memory instruction with consecutive memory
  /// access that can be widened.
  bool
  memoryInstructionCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Returns true if \p I is a memory instruction in an interleaved-group
  /// of memory accesses that can be vectorized with wide vector loads/stores
  /// and shuffles.
  bool
  interleavedAccessCanBeWidened(Instruction *I,
                                ElementCount VF = ElementCount::getFixed(1));

  /// Check if \p Instr belongs to any interleaved access group.
  bool isAccessInterleaved(Instruction *Instr) {
    return InterleaveInfo.isInterleaved(Instr);
  }

  /// Get the interleaved access group that \p Instr belongs to, as recorded
  /// by InterleaveInfo.
  const InterleaveGroup<Instruction> *
  getInterleavedAccessGroup(Instruction *Instr) {
    return InterleaveInfo.getInterleaveGroup(Instr);
  }
1676 
1677   /// Returns true if we're required to use a scalar epilogue for at least
1678   /// the final iteration of the original loop.
1679   bool requiresScalarEpilogue(ElementCount VF) const {
1680     if (!isScalarEpilogueAllowed())
1681       return false;
1682     // If we might exit from anywhere but the latch, must run the exiting
1683     // iteration in scalar form.
1684     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1685       return true;
1686     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1687   }
1688 
  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }

  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const { return FoldTailByMasking; }

  /// Returns true if the instructions in this block requires predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }

  /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
  /// nodes to the chain of instructions representing the reductions. Uses a
  /// MapVector to ensure deterministic iteration order.
  using ReductionChainMap =
      SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;

  /// Return the chain of instructions representing an inloop reduction.
  const ReductionChainMap &getInLoopReductionChains() const {
    return InLoopReductionChains;
  }

  /// Returns true if the Phi is part of an inloop reduction.
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductionChains.count(Phi);
  }
1720 
  /// Estimate cost of an intrinsic call instruction CI if it were vectorized
  /// with factor VF.  Return the cost of the instruction, including
  /// scalarization overhead if it's needed.
  InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;

  /// Estimate cost of a call instruction CI if it were vectorized with factor
  /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized -
  /// i.e. either vector version isn't available, or is too expensive.
  InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
                                    bool &NeedToScalarize) const;

  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
  /// that of B.
  bool isMoreProfitable(const VectorizationFactor &A,
                        const VectorizationFactor &B) const;

  /// Invalidates decisions already taken by the cost model.
  void invalidateCostModelingDecisions() {
    // Drop all lazily computed per-VF state; it is recomputed on demand.
    WideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }
1745 
private:
  // Number of stores that will require predication (updated during cost
  // analysis; the producer is not in this part of the file).
  unsigned NumPredStores = 0;

  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
                                           ElementCount UserVF);

  /// \return the maximized element count based on the targets vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
  /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the properietary compiler (see comments on
  /// D98509). The issue is currently under investigation and this workaround
  /// will be removed as soon as possible.
  ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       const ElementCount &MaxSafeVF);

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1773 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
  using VectorizationCostTy = std::pair<InstructionCost, bool>;

  /// Returns the expected execution cost. The unit of the cost does
  /// not matter because we use the 'cost' units to compare different
  /// vector widths. The cost that is returned is *not* normalized by
  /// the factor width. If \p Invalid is not nullptr, this function
  /// will add a pair(Instruction*, ElementCount) to \p Invalid for
  /// each instruction that has an Invalid cost for the given VF.
  using InstructionVFPair = std::pair<Instruction *, ElementCount>;
  VectorizationCostTy
  expectedCost(ElementCount VF,
               SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);

  /// Returns the execution time cost of an instruction for a given vector
  /// width. Vector width of one means scalar.
  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);

  /// The cost-computation logic from getInstructionCost which provides
  /// the vector type as an output parameter.
  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
                                     Type *&VectorTy);

  /// Return the cost of instructions in an inloop reduction pattern, if I is
  /// part of that pattern.
  Optional<InstructionCost>
  getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
                          TTI::TargetCostKind CostKind);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns whether the instruction is a load or store and will be a emitted
  /// as a vector operation.
  bool isConsecutiveLoadOrStore(Instruction *I);

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;

  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1851 
  /// A set containing all BasicBlocks that are known to present after
  /// vectorization as a predicated block.
  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// All blocks of loop are to be masked to fold tail of scalar iterations.
  bool FoldTailByMasking = false;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom
  ReductionChainMap InLoopReductionChains;

  /// A Map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// vplan. This was added to allow quick lookup to the inloop operations,
  /// without having to loop through InLoopReductionChains.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
                              ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. Non-uniform scalarized instructions will be
  /// represented by VF values in the vectorized loop, each corresponding to an
  /// iteration of the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  /// Per-(instruction, VF) widening decisions and their associated costs.
  DecisionList WideningDecisions;
1929 
1930   /// Returns true if \p V is expected to be vectorized and it needs to be
1931   /// extracted.
1932   bool needsExtract(Value *V, ElementCount VF) const {
1933     Instruction *I = dyn_cast<Instruction>(V);
1934     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1935         TheLoop->isLoopInvariant(I))
1936       return false;
1937 
1938     // Assume we can vectorize V (and hence we need extraction) if the
1939     // scalars are not computed yet. This can happen, because it is called
1940     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1941     // the scalars are collected. That should be a safe assumption in most
1942     // cases, because we check if the operands have vectorizable types
1943     // beforehand in LoopVectorizationLegality.
1944     return Scalars.find(VF) == Scalars.end() ||
1945            !isScalarAfterVectorization(I, VF);
1946   };
1947 
1948   /// Returns a range containing only operands needing to be extracted.
1949   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1950                                                    ElementCount VF) const {
1951     return SmallVector<Value *, 4>(make_filter_range(
1952         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1953   }
1954 
1955   /// Determines if we have the infrastructure to vectorize loop \p L and its
1956   /// epilogue, assuming the main loop is vectorized by \p VF.
1957   bool isCandidateForEpilogueVectorization(const Loop &L,
1958                                            const ElementCount VF) const;
1959 
1960   /// Returns true if epilogue vectorization is considered profitable, and
1961   /// false otherwise.
1962   /// \p VF is the vectorization factor chosen for the original loop.
1963   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1964 
1965 public:
1966   /// The loop that we evaluate.
1967   Loop *TheLoop;
1968 
1969   /// Predicated scalar evolution analysis.
1970   PredicatedScalarEvolution &PSE;
1971 
1972   /// Loop Info analysis.
1973   LoopInfo *LI;
1974 
1975   /// Vectorization legality.
1976   LoopVectorizationLegality *Legal;
1977 
1978   /// Vector target information.
1979   const TargetTransformInfo &TTI;
1980 
1981   /// Target Library Info.
1982   const TargetLibraryInfo *TLI;
1983 
1984   /// Demanded bits analysis.
1985   DemandedBits *DB;
1986 
1987   /// Assumption cache.
1988   AssumptionCache *AC;
1989 
1990   /// Interface to emit optimization remarks.
1991   OptimizationRemarkEmitter *ORE;
1992 
  /// The function containing the loop. (Presumably TheLoop's parent function;
  /// confirm at the constructor, which is outside this view.)
  const Function *TheFunction;
1994 
1995   /// Loop Vectorize Hint.
1996   const LoopVectorizeHints *Hints;
1997 
1998   /// The interleave access information contains groups of interleaved accesses
1999   /// with the same stride and close to each other.
2000   InterleavedAccessInfo &InterleaveInfo;
2001 
2002   /// Values to ignore in the cost model.
2003   SmallPtrSet<const Value *, 16> ValuesToIgnore;
2004 
2005   /// Values to ignore in the cost model when VF > 1.
2006   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
2007 
2008   /// All element types found in the loop.
2009   SmallPtrSet<Type *, 16> ElementTypesInLoop;
2010 
2011   /// Profitable vector factors.
2012   SmallVector<VectorizationFactor, 8> ProfitableVFs;
2013 };
2014 } // end namespace llvm
2015 
/// Helper struct to manage generating runtime checks for vectorization.
///
/// The runtime checks are created up-front in temporary blocks to allow better
/// estimating the cost and un-linked from the existing IR. After deciding to
/// vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr, either no SCEV checks have been generated or they have been used.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr, either no memory runtime checks have been generated or
  /// they have been used.
  Value *MemRuntimeCheckCond = nullptr;

  DominatorTree *DT;
  LoopInfo *LI;

  /// Expander for the SCEV predicate checks. Kept separate from MemCheckExp so
  /// each expander's inserted instructions can be cleaned up independently in
  /// the destructor.
  SCEVExpander SCEVExp;
  /// Expander for the memory runtime checks.
  SCEVExpander MemCheckExp;

public:
  // NOTE(review): both expanders use the "scev.check" name prefix; the memory
  // check expander could plausibly use a distinct prefix (e.g. "mem.check") —
  // confirm whether sharing the name is intentional.
  GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
                    const DataLayout &DL)
      : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
        MemCheckExp(SE, DL, "scev.check") {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and is added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
  void Create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVUnionPredicate &UnionPred) {

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
                                  nullptr, "vector.scevcheck");

      // Materialize the predicate; the resulting condition is later consumed
      // by emitSCEVChecks.
      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          &UnionPred, SCEVCheckBlock->getTerminator());
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      // Chain the memcheck block after the SCEV check block, if one was made.
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
                                 "vector.memcheck");

      MemRuntimeCheckCond =
          addRuntimeChecks(MemCheckBlock->getTerminator(), L,
                           RtPtrChecking.getChecks(), MemCheckExp);
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(Preheader);

    // Swap each check block's terminator into the preheader (replacing the
    // preheader's), leaving the check block terminated by an unreachable so it
    // stays well-formed while detached.
    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
      new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      Preheader->getTerminator()->eraseFromParent();
    }

    // Drop the detached blocks from DT & LI, restoring the preheader as the
    // header's immediate dominator.
    DT->changeImmediateDominator(LoopHeader, Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(MemCheckBlock);
      LI->removeBlock(MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(SCEVCheckBlock);
      LI->removeBlock(SCEVCheckBlock);
    }
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
    // A null cond means the checks were either never generated or already
    // consumed by emitSCEVChecks/emitMemRuntimeChecks; in both cases the
    // expanded code must be kept.
    if (!SCEVCheckCond)
      SCEVCleaner.markResultUsed();

    if (!MemRuntimeCheckCond)
      MemCheckCleaner.markResultUsed();

    if (MemRuntimeCheckCond) {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      // Iterating in reverse erases users before their defs.
      for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(&I))
          continue;
        SE.forgetValue(&I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    // Finally drop the (now empty) detached blocks themselves.
    if (SCEVCheckCond)
      SCEVCheckBlock->eraseFromParent();
    if (MemRuntimeCheckCond)
      MemCheckBlock->eraseFromParent();
  }

  /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
  /// adjusts the branches to branch to the vector preheader or \p Bypass,
  /// depending on the generated condition. Returns the inserted block, or
  /// nullptr if there is nothing to emit (no checks, or a check that constant-
  /// folded to false).
  BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
                             BasicBlock *LoopVectorPreHeader,
                             BasicBlock *LoopExitBlock) {
    if (!SCEVCheckCond)
      return nullptr;
    // A constant-false check can never trigger the bypass; skip it entirely.
    if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
      if (C->isZero())
        return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();

    BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // Create new preheader for vector loop.
    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);

    // Drop the placeholder unreachable terminator installed by Create().
    SCEVCheckBlock->getTerminator()->eraseFromParent();
    SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                SCEVCheckBlock);

    DT->addNewBlock(SCEVCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);

    ReplaceInstWithInst(
        SCEVCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
    // Mark the check as used, to prevent it from being removed during cleanup.
    SCEVCheckCond = nullptr;
    return SCEVCheckBlock;
  }

  /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
  /// the branches to branch to the vector preheader or \p Bypass, depending on
  /// the generated condition. Returns the inserted block, or nullptr if no
  /// memory checks were generated.
  BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
                                   BasicBlock *LoopVectorPreHeader) {
    // Check if we generated code that checks in runtime if arrays overlap.
    if (!MemRuntimeCheckCond)
      return nullptr;

    auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
    Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
                                                MemCheckBlock);

    DT->addNewBlock(MemCheckBlock, Pred);
    DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
    MemCheckBlock->moveBefore(LoopVectorPreHeader);

    if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
      PL->addBasicBlockToLoop(MemCheckBlock, *LI);

    ReplaceInstWithInst(
        MemCheckBlock->getTerminator(),
        BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
    // Preserve the debug location of the branch we are replacing.
    MemCheckBlock->getTerminator()->setDebugLoc(
        Pred->getTerminator()->getDebugLoc());

    // Mark the check as used, to prevent it from being removed during cleanup.
    MemRuntimeCheckCond = nullptr;
    return MemCheckBlock;
  }
};
2216 
2217 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2218 // vectorization. The loop needs to be annotated with #pragma omp simd
2219 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2220 // vector length information is not provided, vectorization is not considered
2221 // explicit. Interleave hints are not allowed either. These limitations will be
2222 // relaxed in the future.
2223 // Please, note that we are currently forced to abuse the pragma 'clang
2224 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2225 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2226 // provides *explicit vectorization hints* (LV can bypass legal checks and
2227 // assume that vectorization is legal). However, both hints are implemented
2228 // using the same metadata (llvm.loop.vectorize, processed by
2229 // LoopVectorizeHints). This will be fixed in the future when the native IR
2230 // representation for pragma 'omp simd' is introduced.
2231 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2232                                    OptimizationRemarkEmitter *ORE) {
2233   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2234   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2235 
2236   // Only outer loops with an explicit vectorization hint are supported.
2237   // Unannotated outer loops are ignored.
2238   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2239     return false;
2240 
2241   Function *Fn = OuterLp->getHeader()->getParent();
2242   if (!Hints.allowVectorization(Fn, OuterLp,
2243                                 true /*VectorizeOnlyWhenForced*/)) {
2244     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2245     return false;
2246   }
2247 
2248   if (Hints.getInterleave() > 1) {
2249     // TODO: Interleave support is future work.
2250     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2251                          "outer loops.\n");
2252     Hints.emitRemarkWithHints();
2253     return false;
2254   }
2255 
2256   return true;
2257 }
2258 
2259 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2260                                   OptimizationRemarkEmitter *ORE,
2261                                   SmallVectorImpl<Loop *> &V) {
2262   // Collect inner loops and outer loops without irreducible control flow. For
2263   // now, only collect outer loops that have explicit vectorization hints. If we
2264   // are stress testing the VPlan H-CFG construction, we collect the outermost
2265   // loop of every loop nest.
2266   if (L.isInnermost() || VPlanBuildStressTest ||
2267       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2268     LoopBlocksRPO RPOT(&L);
2269     RPOT.perform(LI);
2270     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2271       V.push_back(&L);
2272       // TODO: Collect inner loops inside marked outer loops in case
2273       // vectorization fails for the outer loop. Do not invoke
2274       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2275       // already known to be reducible. We can use an inherited attribute for
2276       // that.
2277       return;
2278     }
2279   }
2280   for (Loop *InnerL : L)
2281     collectSupportedLoops(*InnerL, LI, ORE, V);
2282 }
2283 
2284 namespace {
2285 
2286 /// The LoopVectorize Pass.
/// The LoopVectorize Pass.
///
/// Legacy pass-manager wrapper around LoopVectorizePass: gathers the required
/// analyses and forwards to the new-PM implementation.
struct LoopVectorize : public FunctionPass {
  /// Pass identification, replacement for typeid
  static char ID;

  /// The actual (new pass manager) implementation this wrapper delegates to.
  LoopVectorizePass Impl;

  explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
                         bool VectorizeOnlyWhenForced = false)
      : FunctionPass(ID),
        Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
    initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
  }

  /// Collect the analyses declared in getAnalysisUsage and run the shared
  /// implementation. Returns true iff the IR was modified.
  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    // TLI is optional; tolerate its absence.
    auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
    auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
    auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
    auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();

    // Lazily compute LoopAccessInfo per loop, on demand from the impl.
    std::function<const LoopAccessInfo &(Loop &)> GetLAA =
        [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };

    return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
                        GetLAA, *ORE, PSI).MadeAnyChange;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DemandedBitsWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
    AU.addRequired<InjectTLIMappingsLegacy>();

    // We currently do not preserve loopinfo/dominator analyses with outer loop
    // vectorization. Until this is addressed, mark these analyses as preserved
    // only for non-VPlan-native path.
    // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
    if (!EnableVPlanNativePath) {
      AU.addPreserved<LoopInfoWrapperPass>();
      AU.addPreserved<DominatorTreeWrapperPass>();
    }

    AU.addPreserved<BasicAAWrapperPass>();
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
  }
};
2352 
2353 } // end anonymous namespace
2354 
2355 //===----------------------------------------------------------------------===//
2356 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2357 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2358 //===----------------------------------------------------------------------===//
2359 
2360 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2361   // We need to place the broadcast of invariant variables outside the loop,
2362   // but only if it's proven safe to do so. Else, broadcast will be inside
2363   // vector loop body.
2364   Instruction *Instr = dyn_cast<Instruction>(V);
2365   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2366                      (!Instr ||
2367                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2368   // Place the code for broadcasting invariant variables in the new preheader.
2369   IRBuilder<>::InsertPointGuard Guard(Builder);
2370   if (SafeToHoist)
2371     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2372 
2373   // Broadcast the scalar into all locations in the vector.
2374   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2375 
2376   return Shuf;
2377 }
2378 
/// Create a widened (vector) PHI for the integer or FP induction described by
/// \p II, starting at \p Start with step \p Step. \p EntryVal is the original
/// scalar induction phi, or a truncate of it when a narrower vector IV is
/// wanted. The per-part vector values are recorded for \p Def (and \p CastDef,
/// if the induction has associated casts) in \p State.
void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
    const InductionDescriptor &II, Value *Step, Value *Start,
    Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
    VPTransformState &State) {
  assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
         "Expected either an induction phi-node or a truncate of it!");

  // Construct the initial value of the vector IV in the vector loop preheader
  auto CurrIP = Builder.saveIP();
  Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
  if (isa<TruncInst>(EntryVal)) {
    // Narrow both start and step to the truncated IV type.
    assert(Start->getType()->isIntegerTy() &&
           "Truncation requires an integer type");
    auto *TruncType = cast<IntegerType>(EntryVal->getType());
    Step = Builder.CreateTrunc(Step, TruncType);
    Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
  }

  // Initial value: splat(Start) stepped by 0,1,2,... * Step.
  Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
  Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
  Value *SteppedStart =
      getStepVector(SplatStart, Zero, Step, II.getInductionOpcode());

  // We create vector phi nodes for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (Step->getType()->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    AddOp = II.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Multiply the vectorization factor by the step using integer or
  // floating-point arithmetic as appropriate.
  Type *StepType = Step->getType();
  Value *RuntimeVF;
  if (Step->getType()->isFloatingPointTy())
    RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, VF);
  else
    RuntimeVF = getRuntimeVF(Builder, StepType, VF);
  Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);

  // Create a vector splat to use in the induction update.
  //
  // FIXME: If the step is non-constant, we create the vector splat with
  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
  //        handle a constant vector splat.
  Value *SplatVF = isa<Constant>(Mul)
                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
                       : Builder.CreateVectorSplat(VF, Mul);
  Builder.restoreIP(CurrIP);

  // We may need to add the step a number of times, depending on the unroll
  // factor. The last of those goes into the PHI.
  PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
                                    &*LoopVectorBody->getFirstInsertionPt());
  VecInd->setDebugLoc(EntryVal->getDebugLoc());
  Instruction *LastInduction = VecInd;
  for (unsigned Part = 0; Part < UF; ++Part) {
    State.set(Def, LastInduction, Part);

    if (isa<TruncInst>(EntryVal))
      addMetadata(LastInduction, EntryVal);
    recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
                                          State, Part);

    // Advance by VF*Step for the next unrolled part.
    LastInduction = cast<Instruction>(
        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
    LastInduction->setDebugLoc(EntryVal->getDebugLoc());
  }

  // Move the last step to the end of the latch block. This ensures consistent
  // placement of all induction updates.
  auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
  auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
  auto *ICmp = cast<Instruction>(Br->getCondition());
  LastInduction->moveBefore(ICmp);
  LastInduction->setName("vec.ind.next");

  // Wire up the two incoming values of the new vector phi.
  VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
  VecInd->addIncoming(LastInduction, LoopVectorLatch);
}
2464 
2465 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2466   return Cost->isScalarAfterVectorization(I, VF) ||
2467          Cost->isProfitableToScalarize(I, VF);
2468 }
2469 
2470 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2471   if (shouldScalarizeInstruction(IV))
2472     return true;
2473   auto isScalarInst = [&](User *U) -> bool {
2474     auto *I = cast<Instruction>(U);
2475     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2476   };
2477   return llvm::any_of(IV->users(), isScalarInst);
2478 }
2479 
2480 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2481     const InductionDescriptor &ID, const Instruction *EntryVal,
2482     Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
2483     unsigned Part, unsigned Lane) {
2484   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2485          "Expected either an induction phi-node or a truncate of it!");
2486 
2487   // This induction variable is not the phi from the original loop but the
2488   // newly-created IV based on the proof that casted Phi is equal to the
2489   // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
2490   // re-uses the same InductionDescriptor that original IV uses but we don't
2491   // have to do any recording in this case - that is done when original IV is
2492   // processed.
2493   if (isa<TruncInst>(EntryVal))
2494     return;
2495 
2496   if (!CastDef) {
2497     assert(ID.getCastInsts().empty() &&
2498            "there are casts for ID, but no CastDef");
2499     return;
2500   }
2501   assert(!ID.getCastInsts().empty() &&
2502          "there is a CastDef, but no casts for ID");
2503   // Only the first Cast instruction in the Casts vector is of interest.
2504   // The rest of the Casts (if exist) have no uses outside the
2505   // induction update chain itself.
2506   if (Lane < UINT_MAX)
2507     State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
2508   else
2509     State.set(CastDef, VectorLoopVal, Part);
2510 }
2511 
/// Widen the integer or FP induction phi \p IV (or its truncate \p Trunc),
/// choosing between a vector IV, a splat of a scalar IV, and/or scalar steps,
/// based on the cost model's scalarization decisions. Results are recorded for
/// \p Def (and \p CastDef) in \p State.
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
                                                TruncInst *Trunc, VPValue *Def,
                                                VPValue *CastDef,
                                                VPTransformState &State) {
  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
         "Primary induction variable must have an integer type");

  auto II = Legal->getInductionVars().find(IV);
  assert(II != Legal->getInductionVars().end() && "IV is not an induction");

  auto ID = II->second;
  assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");

  // The value from the original loop to which we are mapping the new induction
  // variable.
  Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;

  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant
  auto CreateStepValue = [&](const SCEV *Step) -> Value * {
    assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
           "Induction step should be loop invariant");
    if (PSE.getSE()->isSCEVable(IV->getType())) {
      // Expand the step expression in the vector preheader.
      SCEVExpander Exp(*PSE.getSE(), DL, "induction");
      return Exp.expandCodeFor(Step, Step->getType(),
                               LoopVectorPreHeader->getTerminator());
    }
    // Non-SCEVable types only occur as opaque SCEVUnknowns; use the value.
    return cast<SCEVUnknown>(Step)->getValue();
  };

  // The scalar value to broadcast. This is derived from the canonical
  // induction variable. If a truncation type is given, truncate the canonical
  // induction variable and step. Otherwise, derive these values from the
  // induction descriptor.
  auto CreateScalarIV = [&](Value *&Step) -> Value * {
    Value *ScalarIV = Induction;
    if (IV != OldInduction) {
      // Convert the canonical IV to this induction's type, then transform it
      // per the induction descriptor (start + canonical * step).
      ScalarIV = IV->getType()->isIntegerTy()
                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
                     : Builder.CreateCast(Instruction::SIToFP, Induction,
                                          IV->getType());
      ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
      ScalarIV->setName("offset.idx");
    }
    if (Trunc) {
      // NOTE: Step is an in/out parameter; truncating it here also affects
      // later uses by the caller of this lambda.
      auto *TruncType = cast<IntegerType>(Trunc->getType());
      assert(Step->getType()->isIntegerTy() &&
             "Truncation requires an integer step");
      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
      Step = Builder.CreateTrunc(Step, TruncType);
    }
    return ScalarIV;
  };

  // Create the vector values from the scalar IV, in the absence of creating a
  // vector IV.
  auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
    Value *Broadcasted = getBroadcastInstrs(ScalarIV);
    for (unsigned Part = 0; Part < UF; ++Part) {
      assert(!VF.isScalable() && "scalable vectors not yet supported.");
      // Each unrolled part starts VF*Part steps further along.
      Value *StartIdx;
      if (Step->getType()->isFloatingPointTy())
        StartIdx = getRuntimeVFAsFloat(Builder, Step->getType(), VF * Part);
      else
        StartIdx = getRuntimeVF(Builder, Step->getType(), VF * Part);

      Value *EntryPart =
          getStepVector(Broadcasted, StartIdx, Step, ID.getInductionOpcode());
      State.set(Def, EntryPart, Part);
      if (Trunc)
        addMetadata(EntryPart, Trunc);
      recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
                                            State, Part);
    }
  };

  // Fast-math-flags propagate from the original induction instruction.
  IRBuilder<>::FastMathFlagGuard FMFG(Builder);
  if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
    Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());

  // Now do the actual transformations, and start with creating the step value.
  Value *Step = CreateStepValue(ID.getStep());
  // Scalar VF: no widening possible, just splat the scalar IV.
  if (VF.isZero() || VF.isScalar()) {
    Value *ScalarIV = CreateScalarIV(Step);
    CreateSplatIV(ScalarIV, Step);
    return;
  }

  // Determine if we want a scalar version of the induction variable. This is
  // true if the induction variable itself is not widened, or if it has at
  // least one user in the loop that is not widened.
  auto NeedsScalarIV = needsScalarInduction(EntryVal);
  if (!NeedsScalarIV) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
                                    State);
    return;
  }

  // Try to create a new independent vector induction variable. If we can't
  // create the phi node, we will splat the scalar induction variable in each
  // loop iteration.
  if (!shouldScalarizeInstruction(EntryVal)) {
    createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
                                    State);
    Value *ScalarIV = CreateScalarIV(Step);
    // Create scalar steps that can be used by instructions we will later
    // scalarize. Note that the addition of the scalar steps will not increase
    // the number of instructions in the loop in the common case prior to
    // InstCombine. We will be trading one vector extract for each scalar step.
    buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
    return;
  }

  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorised IV. Except when we tail-fold, then the splat IV feeds the
  // predicate used by the masked loads/stores.
  Value *ScalarIV = CreateScalarIV(Step);
  if (!Cost->isScalarEpilogueAllowed())
    CreateSplatIV(ScalarIV, Step);
  buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
}
2636 
2637 Value *InnerLoopVectorizer::getStepVector(Value *Val, Value *StartIdx,
2638                                           Value *Step,
2639                                           Instruction::BinaryOps BinOp) {
2640   // Create and check the types.
2641   auto *ValVTy = cast<VectorType>(Val->getType());
2642   ElementCount VLen = ValVTy->getElementCount();
2643 
2644   Type *STy = Val->getType()->getScalarType();
2645   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2646          "Induction Step must be an integer or FP");
2647   assert(Step->getType() == STy && "Step has wrong type");
2648 
2649   SmallVector<Constant *, 8> Indices;
2650 
2651   // Create a vector of consecutive numbers from zero to VF.
2652   VectorType *InitVecValVTy = ValVTy;
2653   Type *InitVecValSTy = STy;
2654   if (STy->isFloatingPointTy()) {
2655     InitVecValSTy =
2656         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2657     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2658   }
2659   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2660 
2661   // Splat the StartIdx
2662   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2663 
2664   if (STy->isIntegerTy()) {
2665     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2666     Step = Builder.CreateVectorSplat(VLen, Step);
2667     assert(Step->getType() == Val->getType() && "Invalid step vec");
2668     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
2669     // which can be found from the original scalar operations.
2670     Step = Builder.CreateMul(InitVec, Step);
2671     return Builder.CreateAdd(Val, Step, "induction");
2672   }
2673 
2674   // Floating point induction.
2675   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2676          "Binary Opcode should be specified for FP induction");
2677   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2678   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2679 
2680   Step = Builder.CreateVectorSplat(VLen, Step);
2681   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2682   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2683 }
2684 
// Compute the scalar per-lane induction values for \p EntryVal, i.e.
//   ScalarIV + (Part * VF + Lane) * Step
// for every unroll part (and every lane, unless EntryVal is uniform), and
// record them in \p State under \p Def. For non-uniform scalable VFs a whole
// per-part vector of step values is additionally recorded, since the number
// of lanes is not a compile-time constant.
void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
                                           Instruction *EntryVal,
                                           const InductionDescriptor &ID,
                                           VPValue *Def, VPValue *CastDef,
                                           VPTransformState &State) {
  // We shouldn't have to build scalar steps if we aren't vectorizing.
  assert(VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same integer type.
  Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
  assert(ScalarIVTy == Step->getType() &&
         "Val and Step should have the same type");

  // We build scalar steps for both integer and floating-point induction
  // variables. Here, we determine the kind of arithmetic we will perform.
  Instruction::BinaryOps AddOp;
  Instruction::BinaryOps MulOp;
  if (ScalarIVTy->isIntegerTy()) {
    AddOp = Instruction::Add;
    MulOp = Instruction::Mul;
  } else {
    // FP inductions accumulate via the induction's own opcode (FAdd/FSub).
    AddOp = ID.getInductionOpcode();
    MulOp = Instruction::FMul;
  }

  // Determine the number of scalars we need to generate for each unroll
  // iteration. If EntryVal is uniform, we only need to generate the first
  // lane. Otherwise, we generate all VF values.
  bool IsUniform =
      Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
  unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
  // Compute the scalar steps and save the results in State.
  Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
                                     ScalarIVTy->getScalarSizeInBits());
  Type *VecIVTy = nullptr;
  Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
  // For non-uniform scalable VFs, pre-compute the splats and unit step vector
  // once; they are loop-invariant across all unroll parts.
  if (!IsUniform && VF.isScalable()) {
    VecIVTy = VectorType::get(ScalarIVTy, VF);
    UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
    SplatStep = Builder.CreateVectorSplat(VF, Step);
    SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
  }

  for (unsigned Part = 0; Part < UF; ++Part) {
    // StartIdx0 = Part * VF (a runtime value when VF is scalable).
    Value *StartIdx0 = createStepForVF(Builder, IntStepTy, VF, Part);

    if (!IsUniform && VF.isScalable()) {
      // Emit the whole part's values as a single vector:
      // SplatIV + (StartIdx0 + <0,1,...>) * SplatStep.
      auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
      auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
      if (ScalarIVTy->isFloatingPointTy())
        InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
      auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
      auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
      State.set(Def, Add, Part);
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
                                            Part);
      // It's useful to record the lane values too for the known minimum number
      // of elements so we do those below. This improves the code quality when
      // trying to extract the first element, for example.
    }

    // FP inductions do the lane arithmetic in FP; convert the integer start.
    if (ScalarIVTy->isFloatingPointTy())
      StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);

    for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
      Value *StartIdx = Builder.CreateBinOp(
          AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
      // The step returned by `createStepForVF` is a runtime-evaluated value
      // when VF is scalable. Otherwise, it should be folded into a Constant.
      assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
             "Expected StartIdx to be folded to a constant when VF is not "
             "scalable");
      auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
      auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
      State.set(Def, Add, VPIteration(Part, Lane));
      recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
                                            Part, Lane);
    }
  }
}
2764 
2765 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2766                                                     const VPIteration &Instance,
2767                                                     VPTransformState &State) {
2768   Value *ScalarInst = State.get(Def, Instance);
2769   Value *VectorValue = State.get(Def, Instance.Part);
2770   VectorValue = Builder.CreateInsertElement(
2771       VectorValue, ScalarInst,
2772       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2773   State.set(Def, VectorValue, Instance.Part);
2774 }
2775 
// Return a value with the elements of vector \p Vec in reverse order, emitted
// via IRBuilder::CreateVectorReverse.
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
  assert(Vec->getType()->isVectorTy() && "Invalid type");
  return Builder.CreateVectorReverse(Vec, "reverse");
}
2780 
2781 // Return whether we allow using masked interleave-groups (for dealing with
2782 // strided loads/stores that reside in predicated blocks, or for dealing
2783 // with gaps).
2784 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2785   // If an override option has been passed in for interleaved accesses, use it.
2786   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2787     return EnableMaskedInterleavedMemAccesses;
2788 
2789   return TTI.enableMaskedInterleavedAccessVectorization();
2790 }
2791 
// Try to vectorize the interleave group that \p Instr belongs to.
//
// E.g. Translate following interleaved load group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     R = Pic[i];             // Member of index 0
//     G = Pic[i+1];           // Member of index 1
//     B = Pic[i+2];           // Member of index 2
//     ... // do something to R, G, B
//   }
// To:
//   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
//   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
//   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
//   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
//
// Or translate following interleaved store group (factor = 3):
//   for (i = 0; i < N; i+=3) {
//     ... do something to R, G, B
//     Pic[i]   = R;           // Member of index 0
//     Pic[i+1] = G;           // Member of index 1
//     Pic[i+2] = B;           // Member of index 2
//   }
// To:
//   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
//   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
//   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
//        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
//   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
//
// \p VPDefs holds the VPValues to record each loaded member under (loads
// only), \p Addr the per-part base address, \p StoredValues the values of
// each stored member (stores only), and \p BlockInMask the block predicate
// if the group sits in a predicated block (nullptr otherwise).
void InnerLoopVectorizer::vectorizeInterleaveGroup(
    const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
    VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
    VPValue *BlockInMask) {
  Instruction *Instr = Group->getInsertPos();
  const DataLayout &DL = Instr->getModule()->getDataLayout();

  // Prepare for the vector type of the interleaved load/store.
  Type *ScalarTy = getLoadStoreType(Instr);
  unsigned InterleaveFactor = Group->getFactor();
  assert(!VF.isScalable() && "scalable vectors not yet supported.");
  auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);

  // Prepare for the new pointers.
  SmallVector<Value *, 2> AddrParts;
  unsigned Index = Group->getIndex(Instr);

  // TODO: extend the masked interleaved-group support to reversed access.
  assert((!BlockInMask || !Group->isReverse()) &&
         "Reversed masked interleave-group not supported.");

  // If the group is reverse, adjust the index to refer to the last vector lane
  // instead of the first. We adjust the index from the first vector lane,
  // rather than directly getting the pointer for lane VF - 1, because the
  // pointer operand of the interleaved access is supposed to be uniform. For
  // uniform instructions, we're only required to generate a value for the
  // first vector lane in each unroll iteration.
  if (Group->isReverse())
    Index += (VF.getKnownMinValue() - 1) * Group->getFactor();

  // Compute the wide pointer for each unroll part, adjusted back to the
  // member of index 0 and bitcast to a pointer-to-vector type.
  for (unsigned Part = 0; Part < UF; Part++) {
    Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
    setDebugLocFromInst(AddrPart);

    // Notice current instruction could be any index. Need to adjust the address
    // to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // Current pointer is pointed to A[i+1], adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].

    // Preserve inbounds from the original GEP, if any, on the adjusted GEP.
    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
      InBounds = gep->isInBounds();
    AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
    cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);

    // Cast to the vector pointer type.
    unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
    Type *PtrTy = VecTy->getPointerTo(AddressSpace);
    AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
  }

  setDebugLocFromInst(Instr);
  Value *PoisonVec = PoisonValue::get(VecTy);

  // If the group has gaps and no scalar epilogue is allowed to handle them,
  // build a mask that disables the gap lanes.
  Value *MaskForGaps = nullptr;
  if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
    MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
    assert(MaskForGaps && "Mask for Gaps is required but it is null");
  }

  // Vectorize the interleaved load group.
  if (isa<LoadInst>(Instr)) {
    // For each unroll part, create a wide load for the group.
    SmallVector<Value *, 2> NewLoads;
    for (unsigned Part = 0; Part < UF; Part++) {
      Instruction *NewLoad;
      if (BlockInMask || MaskForGaps) {
        assert(useMaskedInterleavedAccesses(*TTI) &&
               "masked interleaved groups are not allowed.");
        // Replicate the block mask across the interleaved lanes, and combine
        // it with the gap mask when both are present.
        Value *GroupMask = MaskForGaps;
        if (BlockInMask) {
          Value *BlockInMaskPart = State.get(BlockInMask, Part);
          Value *ShuffledMask = Builder.CreateShuffleVector(
              BlockInMaskPart,
              createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
              "interleaved.mask");
          GroupMask = MaskForGaps
                          ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
                                                MaskForGaps)
                          : ShuffledMask;
        }
        NewLoad =
            Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
                                     GroupMask, PoisonVec, "wide.masked.vec");
      }
      else
        NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
                                            Group->getAlign(), "wide.vec");
      Group->addMetadata(NewLoad);
      NewLoads.push_back(NewLoad);
    }

    // For each member in the group, shuffle out the appropriate data from the
    // wide loads.
    unsigned J = 0;
    for (unsigned I = 0; I < InterleaveFactor; ++I) {
      Instruction *Member = Group->getMember(I);

      // Skip the gaps in the group.
      if (!Member)
        continue;

      auto StrideMask =
          createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
      for (unsigned Part = 0; Part < UF; Part++) {
        Value *StridedVec = Builder.CreateShuffleVector(
            NewLoads[Part], StrideMask, "strided.vec");

        // If this member has different type, cast the result type.
        if (Member->getType() != ScalarTy) {
          assert(!VF.isScalable() && "VF is assumed to be non scalable.");
          VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
          StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
        }

        if (Group->isReverse())
          StridedVec = reverseVector(StridedVec);

        State.set(VPDefs[J], StridedVec, Part);
      }
      ++J;
    }
    return;
  }

  // The sub vector type for current instruction.
  auto *SubVT = VectorType::get(ScalarTy, VF);

  // Vectorize the interleaved store group. Stores unconditionally need a gap
  // mask if there are gaps; the result may be null for full groups.
  MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
  assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
         "masked interleaved groups are not allowed.");
  assert((!MaskForGaps || !VF.isScalable()) &&
         "masking gaps for scalable vectors is not yet supported.");
  for (unsigned Part = 0; Part < UF; Part++) {
    // Collect the stored vector from each member.
    SmallVector<Value *, 4> StoredVecs;
    for (unsigned i = 0; i < InterleaveFactor; i++) {
      assert((Group->getMember(i) || MaskForGaps) &&
             "Fail to get a member from an interleaved store group");
      Instruction *Member = Group->getMember(i);

      // Skip the gaps in the group.
      if (!Member) {
        Value *Undef = PoisonValue::get(SubVT);
        StoredVecs.push_back(Undef);
        continue;
      }

      Value *StoredVec = State.get(StoredValues[i], Part);

      if (Group->isReverse())
        StoredVec = reverseVector(StoredVec);

      // If this member has different type, cast it to a unified type.

      if (StoredVec->getType() != SubVT)
        StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);

      StoredVecs.push_back(StoredVec);
    }

    // Concatenate all vectors into a wide vector.
    Value *WideVec = concatenateVectors(Builder, StoredVecs);

    // Interleave the elements in the wide vector.
    Value *IVec = Builder.CreateShuffleVector(
        WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
        "interleaved.vec");

    Instruction *NewStoreInstr;
    if (BlockInMask || MaskForGaps) {
      // Replicate the block mask across the interleaved lanes, and combine it
      // with the gap mask when both are present (mirrors the load path above).
      Value *GroupMask = MaskForGaps;
      if (BlockInMask) {
        Value *BlockInMaskPart = State.get(BlockInMask, Part);
        Value *ShuffledMask = Builder.CreateShuffleVector(
            BlockInMaskPart,
            createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
            "interleaved.mask");
        GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
                                                      ShuffledMask, MaskForGaps)
                                : ShuffledMask;
      }
      NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
                                                Group->getAlign(), GroupMask);
    } else
      NewStoreInstr =
          Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());

    Group->addMetadata(NewStoreInstr);
  }
}
3019 
// Widen a single load or store \p Instr across all UF unroll parts.
// Consecutive accesses become wide (possibly masked) loads/stores;
// non-consecutive ones become gathers/scatters. Reversed consecutive accesses
// additionally reverse the stored/loaded data (and the mask). Results of
// widened loads are recorded in \p State under \p Def.
void InnerLoopVectorizer::vectorizeMemoryInstruction(
    Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
    VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride,
    bool Reverse) {
  // Attempt to issue a wide load.
  LoadInst *LI = dyn_cast<LoadInst>(Instr);
  StoreInst *SI = dyn_cast<StoreInst>(Instr);

  assert((LI || SI) && "Invalid Load/Store instruction");
  assert((!SI || StoredValue) && "No stored value provided for widened store");
  assert((!LI || !StoredValue) && "Stored value provided for widened load");

  Type *ScalarDataTy = getLoadStoreType(Instr);

  auto *DataTy = VectorType::get(ScalarDataTy, VF);
  const Align Alignment = getLoadStoreAlignment(Instr);
  // Non-consecutive accesses are widened as gathers/scatters.
  bool CreateGatherScatter = !ConsecutiveStride;

  // Materialize the per-part block masks up front, if the access is
  // predicated.
  VectorParts BlockInMaskParts(UF);
  bool isMaskRequired = BlockInMask;
  if (isMaskRequired)
    for (unsigned Part = 0; Part < UF; ++Part)
      BlockInMaskParts[Part] = State.get(BlockInMask, Part);

  // Compute the address of part \p Part for a consecutive access starting at
  // \p Ptr, as a pointer to the wide vector type. For reversed accesses this
  // also reverses the corresponding block mask in BlockInMaskParts.
  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
    GetElementPtrInst *PartPtr = nullptr;

    // Preserve inbounds from the original GEP, if any.
    bool InBounds = false;
    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
      InBounds = gep->isInBounds();
    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
      // RunTimeVF =  VScale * VF.getKnownMinValue()
      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
      Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
      // NumElt = -Part * RunTimeVF
      Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
      // LastLane = 1 - RunTimeVF
      Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
      PartPtr =
          cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
      PartPtr->setIsInBounds(InBounds);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
    } else {
      // Forward access: simply advance by Part * VF elements.
      Value *Increment =
          createStepForVF(Builder, Builder.getInt32Ty(), VF, Part);
      PartPtr = cast<GetElementPtrInst>(
          Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
      PartPtr->setIsInBounds(InBounds);
    }

    unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
  };

  // Handle Stores:
  if (SI) {
    setDebugLocFromInst(SI);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Instruction *NewSI = nullptr;
      Value *StoredVal = State.get(StoredValue, Part);
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
        Value *VectorGep = State.get(Addr, Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
      } else {
        if (Reverse) {
          // If we store to reverse consecutive memory locations, then we need
          // to reverse the order of elements in the stored value.
          StoredVal = reverseVector(StoredVal);
          // We don't want to update the value in the map as it might be used in
          // another expression. So don't call resetVectorValue(StoredVal).
        }
        auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
        if (isMaskRequired)
          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                            BlockInMaskParts[Part]);
        else
          NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
      }
      addMetadata(NewSI, SI);
    }
    return;
  }

  // Handle loads.
  assert(LI && "Must have a load instruction");
  setDebugLocFromInst(LI);
  for (unsigned Part = 0; Part < UF; ++Part) {
    Value *NewLI;
    if (CreateGatherScatter) {
      Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
      Value *VectorGep = State.get(Addr, Part);
      NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
                                         nullptr, "wide.masked.gather");
      addMetadata(NewLI, LI);
    } else {
      auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
      if (isMaskRequired)
        NewLI = Builder.CreateMaskedLoad(
            DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
            PoisonValue::get(DataTy), "wide.masked.load");
      else
        NewLI =
            Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");

      // Add metadata to the load, but setVectorValue to the reverse shuffle.
      addMetadata(NewLI, LI);
      if (Reverse)
        NewLI = reverseVector(NewLI);
    }

    State.set(Def, NewLI, Part);
  }
}
3143 
// Emit a scalar clone of \p Instr for the single vector-loop iteration
// \p Instance (unroll part + lane), wiring its operands to the scalar values
// previously recorded for the recipe's operands in \p State, and record the
// clone in \p State. If \p IfPredicateInstr, the clone is queued for later
// predication.
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
                                               VPReplicateRecipe *RepRecipe,
                                               const VPIteration &Instance,
                                               bool IfPredicateInstr,
                                               VPTransformState &State) {
  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");

  // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
  // the first lane and part.
  if (isa<NoAliasScopeDeclInst>(Instr))
    if (!Instance.isFirstIteration())
      return;

  setDebugLocFromInst(Instr);

  // Does this instruction return a value ?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();

  Instruction *Cloned = Instr->clone();
  if (!IsVoidRetTy)
    Cloned->setName(Instr->getName() + ".cloned");

  // If the scalarized instruction contributes to the address computation of a
  // widen masked load/store which was in a basic block that needed predication
  // and is not predicated after vectorization, we can't propagate
  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
  // instruction could feed a poison value to the base address of the widen
  // load/store.
  if (State.MayGeneratePoisonRecipes.count(RepRecipe) > 0)
    Cloned->dropPoisonGeneratingFlags();

  // Keep State's builder in sync with ours so anything State.get() needs to
  // emit lands at the current insertion point.
  State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
                               Builder.GetInsertPoint());
  // Replace the operands of the cloned instructions with their scalar
  // equivalents in the new loop.
  for (unsigned op = 0, e = RepRecipe->getNumOperands(); op != e; ++op) {
    auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
    auto InputInstance = Instance;
    // Loop-invariant or uniform operands only have a first-lane value.
    if (!Operand || !OrigLoop->contains(Operand) ||
        (Cost->isUniformAfterVectorization(Operand, State.VF)))
      InputInstance.Lane = VPLane::getFirstLane();
    auto *NewOp = State.get(RepRecipe->getOperand(op), InputInstance);
    Cloned->setOperand(op, NewOp);
  }
  addNewMetadata(Cloned, Instr);

  // Place the cloned scalar in the new loop.
  Builder.Insert(Cloned);

  State.set(RepRecipe, Cloned, Instance);

  // If we just cloned a new assumption, add it the assumption cache.
  if (auto *II = dyn_cast<AssumeInst>(Cloned))
    AC->registerAssumption(II);

  // End if-block.
  if (IfPredicateInstr)
    PredicatedInstructions.push_back(Cloned);
}
3203 
// Create the primary induction variable for loop \p L: a phi starting at
// \p Start, incremented by \p Step on the latch, exiting when the incremented
// value equals \p End. The latch's existing terminator is replaced by the new
// compare-and-branch. Returns the new induction phi.
PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
                                                      Value *End, Value *Step,
                                                      Instruction *DL) {
  BasicBlock *Header = L->getHeader();
  BasicBlock *Latch = L->getLoopLatch();
  // As we're just creating this loop, it's possible no latch exists
  // yet. If so, use the header as this will be a single block loop.
  if (!Latch)
    Latch = Header;

  // Build the phi at the top of the header, borrowing the debug location of
  // the old induction (or its operands), if available.
  IRBuilder<> B(&*Header->getFirstInsertionPt());
  Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
  setDebugLocFromInst(OldInst, &B);
  auto *Induction = B.CreatePHI(Start->getType(), 2, "index");

  B.SetInsertPoint(Latch->getTerminator());
  setDebugLocFromInst(OldInst, &B);

  // Create i+1 and fill the PHINode.
  //
  // If the tail is not folded, we know that End - Start >= Step (either
  // statically or through the minimum iteration checks). We also know that both
  // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
  // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
  // overflows and we can mark the induction increment as NUW.
  Value *Next = B.CreateAdd(Induction, Step, "index.next",
                            /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
  Induction->addIncoming(Start, L->getLoopPreheader());
  Induction->addIncoming(Next, Latch);
  // Create the compare.
  Value *ICmp = B.CreateICmpEQ(Next, End);
  B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);

  // Now we have two terminators. Remove the old one from the block.
  Latch->getTerminator()->eraseFromParent();

  return Induction;
}
3242 
// Return the trip count (N) of the original scalar loop, computed as
// BackedgeTakenCount + 1 at the widest induction type, expanding the SCEV
// expression into the preheader on first use and caching it in TripCount.
Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
  if (TripCount)
    return TripCount;

  assert(L && "Create Trip Count for null loop.");
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
  // Find the loop boundaries.
  ScalarEvolution *SE = PSE.getSE();
  const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
  assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
         "Invalid loop count");

  Type *IdxTy = Legal->getWidestInductionType();
  assert(IdxTy && "No type for induction");

  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge taken count is that the
  // induction variable was signed and as such will not overflow. In such a case
  // truncation is legal.
  if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
      IdxTy->getPrimitiveSizeInBits())
    BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
  BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);

  // Get the total trip count from the count by adding 1.
  const SCEV *ExitCount = SE->getAddExpr(
      BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));

  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();

  // Expand the trip count and place the new instructions in the preheader.
  // Notice that the pre-header does not change, only the loop body.
  SCEVExpander Exp(*SE, DL, "induction");

  // Count holds the overall loop count (N).
  TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
                                L->getLoopPreheader()->getTerminator());

  // Pointer-typed trip counts are converted to the integer induction type.
  if (TripCount->getType()->isPointerTy())
    TripCount =
        CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
                                    L->getLoopPreheader()->getTerminator());

  return TripCount;
}
3289 
// Return the number of iterations the vectorized loop body will execute:
// the trip count N, rounded down (or up when tail-folding) to a multiple of
// the step VF * UF, with a full step reserved for the scalar epilogue when
// the cost model requires one. Computed once and cached in VectorTripCount.
Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getOrCreateTripCount(L);
  IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());

  Type *Ty = TC->getType();
  // This is where we can make the step a runtime constant.
  Value *Step = createStepForVF(Builder, Ty, VF, UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    assert(!VF.isScalable() &&
           "Tail folding not yet supported for scalable vectors");
    TC = Builder.CreateAdd(
        TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");

  // There are cases where we *must* run at least one iteration in the remainder
  // loop.  See the cost model for when this can happen.  If the step evenly
  // divides the trip count, we set the remainder to be equal to the step. If
  // the step does not evenly divide the trip count, no adjustment is necessary
  // since there will already be scalar iterations. Note that the minimum
  // iterations check ensures that N >= Step.
  if (Cost->requiresScalarEpilogue(VF)) {
    auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
    R = Builder.CreateSelect(IsZero, Step, R);
  }

  VectorTripCount = Builder.CreateSub(TC, R, "n.vec");

  return VectorTripCount;
}
3338 
3339 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3340                                                    const DataLayout &DL) {
3341   // Verify that V is a vector type with same number of elements as DstVTy.
3342   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3343   unsigned VF = DstFVTy->getNumElements();
3344   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3345   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3346   Type *SrcElemTy = SrcVecTy->getElementType();
3347   Type *DstElemTy = DstFVTy->getElementType();
3348   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3349          "Vector elements must have same size");
3350 
3351   // Do a direct cast if element types are castable.
3352   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3353     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3354   }
3355   // V cannot be directly casted to desired vector type.
3356   // May happen when V is a floating point vector but DstVTy is a vector of
3357   // pointers or vice-versa. Handle this using a two-step bitcast using an
3358   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
3359   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3360          "Only one type should be a pointer type");
3361   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3362          "Only one type should be a floating point type");
3363   Type *IntTy =
3364       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3365   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3366   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3367   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3368 }
3369 
/// Emit a guard in front of the vector loop that branches to \p Bypass (the
/// scalar preheader) when the trip count of \p L is too small for even one
/// VF * UF iteration. The current vector preheader becomes the check block; a
/// fresh "vector.ph" block is split off to serve as the new preheader.
void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
                                                         BasicBlock *Bypass) {
  Value *Count = getOrCreateTripCount(L);
  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
                                            : ICmpInst::ICMP_ULT;

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *CheckMinIters = Builder.getFalse();
  if (!Cost->foldTailByMasking()) {
    Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
    CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
  }
  // Create new preheader for vector loop.
  LoopVectorPreHeader =
      SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
                 "vector.ph");

  assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
                               DT->getNode(Bypass)->getIDom()) &&
         "TC check is expected to dominate Bypass");

  // Update dominator for Bypass & LoopExit (if needed).
  DT->changeImmediateDominator(Bypass, TCCheckBlock);
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);

  // Replace the unconditional terminator with the conditional bypass branch.
  ReplaceInstWithInst(
      TCCheckBlock->getTerminator(),
      BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
  LoopBypassBlocks.push_back(TCCheckBlock);
}
3414 
/// Emit the runtime checks validating the SCEV assumptions (predicates) made
/// for \p L, branching to \p Bypass when a check fails. Returns the block
/// containing the checks, or nullptr when no SCEV checks were required.
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {

  BasicBlock *const SCEVCheckBlock =
      RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
  if (!SCEVCheckBlock)
    return nullptr;

  assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
           (OptForSizeBasedOnProfile &&
            Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
         "Cannot SCEV check stride or overflow when optimizing for size");


  // Update dominator only if this is first RT check; for later checks the
  // minimum-iteration check block (already in LoopBypassBlocks) dominates.
  if (LoopBypassBlocks.empty()) {
    DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
    if (!Cost->requiresScalarEpilogue(VF))
      // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
      // dominator of the exit blocks.
      DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
  }

  LoopBypassBlocks.push_back(SCEVCheckBlock);
  AddedSafetyChecks = true;
  return SCEVCheckBlock;
}
3442 
/// Emit the runtime memory checks proving that the loop's pointer accesses do
/// not overlap, branching to \p Bypass when they do. Returns the block
/// holding the checks, or nullptr when none are needed. Also sets up
/// LoopVersioning so noalias metadata can be attached later.
BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
                                                      BasicBlock *Bypass) {
  // VPlan-native path does not do any analysis for runtime checks currently.
  if (EnableVPlanNativePath)
    return nullptr;

  BasicBlock *const MemCheckBlock =
      RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);

  // Check if we generated code that checks in runtime if arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
  if (!MemCheckBlock)
    return nullptr;

  // Memory checks under opt-for-size are only legal when the user forced
  // vectorization; warn about the size impact in that case.
  if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
    assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
           "Cannot emit memory checks when optimizing for size, unless forced "
           "to vectorize.");
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
                                        L->getStartLoc(), L->getHeader())
             << "Code-size may be reduced by not forcing "
                "vectorization, or by source-code modifications "
                "eliminating the need for runtime checks "
                "(e.g., adding 'restrict').";
    });
  }

  LoopBypassBlocks.push_back(MemCheckBlock);

  AddedSafetyChecks = true;

  // We currently don't use LoopVersioning for the actual loop cloning but we
  // still use it to add the noalias metadata.
  LVer = std::make_unique<LoopVersioning>(
      *Legal->getLAI(),
      Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
      DT, PSE.getSE());
  LVer->prepareNoAliasMetadata();
  return MemCheckBlock;
}
3485 
/// Compute the induction value reached after \p Index steps of the induction
/// described by \p ID, i.e. StartValue op (Index * Step), where 'op' depends
/// on the induction kind (integer add, pointer GEP, or FP fadd/fsub).
/// Returns nullptr for IK_NoInduction.
Value *InnerLoopVectorizer::emitTransformedIndex(
    IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
    const InductionDescriptor &ID) const {

  SCEVExpander Exp(*SE, DL, "induction");
  auto Step = ID.getStep();
  auto StartValue = ID.getStartValue();
  assert(Index->getType()->getScalarType() == Step->getType() &&
         "Index scalar type does not match StepValue type");

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    // Fold the 0 + Y and X + 0 cases by hand (see note above on why we
    // cannot rely on SCEV simplification here).
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isZero())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isZero())
        return X;
    return B.CreateAdd(X, Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    // Fold the 1 * Y and X * 1 cases by hand.
    if (auto *CX = dyn_cast<ConstantInt>(X))
      if (CX->isOne())
        return Y;
    if (auto *CY = dyn_cast<ConstantInt>(Y))
      if (CY->isOne())
        return X;
    VectorType *XVTy = dyn_cast<VectorType>(X->getType());
    if (XVTy && !isa<VectorType>(Y->getType()))
      Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
    return B.CreateMul(X, Y);
  };

  // Get a suitable insert point for SCEV expansion. For blocks in the vector
  // loop, choose the end of the vector loop header (=LoopVectorBody), because
  // the DomTree is not kept up-to-date for additional blocks generated in the
  // vector loop. By using the header as insertion point, we guarantee that the
  // expanded instructions dominate all their uses.
  auto GetInsertPoint = [this, &B]() {
    BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
    if (InsertBB != LoopVectorBody &&
        LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
      return LoopVectorBody->getTerminator();
    return &*B.GetInsertPoint();
  };

  switch (ID.getKind()) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    // Special case a step of -1: Start + Index * -1 == Start - Index.
    if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
      return B.CreateSub(StartValue, Index);
    auto *Offset = CreateMul(
        Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction: {
    assert(isa<SCEVConstant>(Step) &&
           "Expected constant step for pointer induction");
    // Pointer inductions advance by GEP: StartValue + Index * Step elements.
    return B.CreateGEP(
        ID.getElementType(), StartValue,
        CreateMul(Index,
                  Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
                                    GetInsertPoint())));
  }
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    auto InductionBinOp = ID.getInductionBinOp();
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    // Reuse the original fadd/fsub opcode so FP semantics are preserved.
    Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
    Value *MulExp = B.CreateFMul(StepValue, Index);
    return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
                         "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
3584 
/// Split the original loop's preheader to create the skeleton CFG of the
/// vectorized loop (middle block, scalar preheader, empty vector body), name
/// the new blocks with \p Prefix, register the new loop with LoopInfo, and
/// return it.
Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopScalarBody = OrigLoop->getHeader();
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
  assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
         "multiple exit loop without required epilogue?");

  // Split twice: preheader -> middle.block -> scalar.ph (the original loop
  // header now follows scalar.ph).
  LoopMiddleBlock =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, Twine(Prefix) + "middle.block");
  LoopScalarPreHeader =
      SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
                 nullptr, Twine(Prefix) + "scalar.ph");

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Set up the middle block terminator.  Two cases:
  // 1) If we know that we must execute the scalar epilogue, emit an
  //    unconditional branch.
  // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case).  In this case, set up a conditional
  //    branch from the middle block to the loop scalar preheader, and the
  //    exit block.  completeLoopSkeleton will update the condition to use an
  //    iteration check, if required to decide whether to execute the remainder.
  BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
    BranchInst::Create(LoopScalarPreHeader) :
    BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
                       Builder.getTrue());
  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
  ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);

  // We intentionally don't let SplitBlock to update LoopInfo since
  // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place few lines later.
  LoopVectorBody =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 nullptr, nullptr, Twine(Prefix) + "vector.body");

  // Update dominator for loop exit.
  if (!Cost->requiresScalarEpilogue(VF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);

  // Create and register the new vector loop.
  Loop *Lp = LI->AllocateLoop();
  Loop *ParentLoop = OrigLoop->getParentLoop();

  // Insert the new loop into the loop nest and register the new basic blocks
  // before calling any utilities such as SCEV that require valid LoopInfo.
  if (ParentLoop) {
    ParentLoop->addChildLoop(Lp);
  } else {
    LI->addTopLevelLoop(Lp);
  }
  Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
  return Lp;
}
3645 
/// Create the "bc.resume.val" phis in the scalar preheader that give each
/// induction variable its correct starting value when the scalar (epilogue)
/// loop is entered: the transformed end value when coming from the vector
/// loop's middle block, or the original start value when coming from a bypass
/// block. \p AdditionalBypass optionally names one extra bypass block and the
/// resume value to use for it (used by epilogue vectorization); both members
/// must be set or both null.
void InnerLoopVectorizer::createInductionResumeValues(
    Loop *L, Value *VectorTripCount,
    std::pair<BasicBlock *, Value *> AdditionalBypass) {
  assert(VectorTripCount && L && "Expected valid arguments");
  assert(((AdditionalBypass.first && AdditionalBypass.second) ||
          (!AdditionalBypass.first && !AdditionalBypass.second)) &&
         "Inconsistent information about additional bypass.");
  // We are going to resume the execution of the scalar loop.
  // Go over all of the induction variables that we found and fix the
  // PHIs that are left in the scalar version of the loop.
  // The starting values of PHI nodes depend on the counter of the last
  // iteration in the vectorized loop.
  // If we come from a bypass edge then we need to start from the original
  // start value.
  for (auto &InductionEntry : Legal->getInductionVars()) {
    PHINode *OrigPhi = InductionEntry.first;
    InductionDescriptor II = InductionEntry.second;

    // Create phi nodes to merge from the  backedge-taken check block.
    PHINode *BCResumeVal =
        PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
                        LoopScalarPreHeader->getTerminator());
    // Copy original phi DL over to the new one.
    BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
    Value *&EndValue = IVEndValues[OrigPhi];
    Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
    if (OrigPhi == OldInduction) {
      // We know what the end value is: the primary induction counts exactly
      // the vector trip count.
      EndValue = VectorTripCount;
    } else {
      // For secondary inductions, materialize StartValue + VTC * Step in the
      // preheader of the vector loop.
      IRBuilder<> B(L->getLoopPreheader()->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      Type *StepType = II.getStep()->getType();
      Instruction::CastOps CastOp =
          CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
      Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
      const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
      EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
      EndValue->setName("ind.end");

      // Compute the end value for the additional bypass (if applicable).
      if (AdditionalBypass.first) {
        B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
        CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
                                         StepType, true);
        CRD =
            B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
        EndValueFromAdditionalBypass =
            emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
        EndValueFromAdditionalBypass->setName("ind.end");
      }
    }
    // The new PHI merges the original incoming value, in case of a bypass,
    // or the value at the end of the vectorized loop.
    BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);

    // Fix the scalar body counter (PHI node).
    // The old induction's phi node in the scalar body needs the truncated
    // value.
    for (BasicBlock *BB : LoopBypassBlocks)
      BCResumeVal->addIncoming(II.getStartValue(), BB);

    if (AdditionalBypass.first)
      BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
                                            EndValueFromAdditionalBypass);

    OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
  }
}
3719 
/// Finish the vector-loop skeleton for \p L: wire up the middle block's
/// remainder check, position the builder in the vector body, and transfer or
/// update loop metadata (follow-up hints or "already vectorized") from
/// \p OrigLoopID. Returns the vector loop's preheader.
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
                                                      MDNode *OrigLoopID) {
  assert(L && "Expected valid loop.");

  // The trip counts should be cached by now.
  Value *Count = getOrCreateTripCount(L);
  Value *VectorTripCount = getOrCreateVectorTripCount(L);

  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();

  // Add a check in the middle block to see if we have completed
  // all of the iterations in the first vector loop.  Three cases:
  // 1) If we require a scalar epilogue, there is no conditional branch as
  //    we unconditionally branch to the scalar preheader.  Do nothing.
  // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
  //    Thus if tail is to be folded, we know we don't need to run the
  //    remainder and we can use the previous value for the condition (true).
  // 3) Otherwise, construct a runtime check.
  if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
    Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
                                        Count, VectorTripCount, "cmp.n",
                                        LoopMiddleBlock->getTerminator());

    // Here we use the same DebugLoc as the scalar loop latch terminator instead
    // of the corresponding compare because they may have ended up with
    // different line numbers and we want to avoid awkward line stepping while
    // debugging. Eg. if the compare has got a line number inside the loop.
    CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
    cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
  }

  // Get ready to start creating new instructions into the vectorized body.
  assert(LoopVectorPreHeader == L->getLoopPreheader() &&
         "Inconsistent vector loop preheader");
  Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());

  Optional<MDNode *> VectorizedLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupVectorized});
  if (VectorizedLoopID.hasValue()) {
    L->setLoopID(VectorizedLoopID.getValue());

    // Do not setAlreadyVectorized if loop attributes have been defined
    // explicitly.
    return LoopVectorPreHeader;
  }

  // Keep all loop hints from the original loop on the vector loop (we'll
  // replace the vectorizer-specific hints below).
  if (MDNode *LID = OrigLoop->getLoopID())
    L->setLoopID(LID);

  // Mark the vector loop so later vectorizer runs skip it.
  LoopVectorizeHints Hints(L, true, *ORE);
  Hints.setAlreadyVectorized();

#ifdef EXPENSIVE_CHECKS
  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
  LI->verify(*DT);
#endif

  return LoopVectorPreHeader;
}
3782 
/// Top-level driver that builds the full CFG skeleton for the vectorized loop
/// (iteration-count check, SCEV and memory runtime checks, empty vector body,
/// induction variable and resume phis) and returns the vector preheader.
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- loop iteration number check.
    /   |
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop.
  |     |
  |     v
  \   -[ ]   <--- middle-block.
   \/   |
   /\   v
   | ->[ ]     <--- new preheader.
   |    |
 (opt)  v      <-- edge from middle to exit iff epilogue is not required.
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
    \   |
     \  v
      >[ ]     <-- exit block(s).
   ...
   */

  // Get the metadata of the original loop before it gets modified.
  MDNode *OrigLoopID = OrigLoop->getLoopID();

  // Workaround!  Compute the trip count of the original loop and cache it
  // before we start modifying the CFG.  This code has a systemic problem
  // wherein it tries to run analysis over partially constructed IR; this is
  // wrong, and not simply for SCEV.  The trip count of the original loop
  // simply happens to be prone to hitting this in practice.  In theory, we
  // can hit the same issue for any SCEV, or ValueTracking query done during
  // mutation.  See PR49900.
  getOrCreateTripCount(OrigLoop);

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  Loop *Lp = createVectorLoopSkeleton("");

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  emitSCEVChecks(Lp, LoopScalarPreHeader);

  // Generate the code that checks in runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Some loops have a single integer induction variable, while other loops
  // don't. One example is c++ iterators that often have multiple pointer
  // induction variables. In the code below we also support a case where we
  // don't have a single induction variable.
  //
  // We try to obtain an induction variable from the original loop as hard
  // as possible. However if we don't find one that:
  //   - is an integer
  //   - counts from zero, stepping by one
  //   - is the size of the widest induction variable type
  // then we create a new one.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);
  // The loop step is equal to the vectorization factor (num of SIMD elements)
  // times the unroll factor (num of SIMD instructions).
  Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
  Value *Step = createStepForVF(Builder, IdxTy, VF, UF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Emit phis for the new starting index of the scalar loop.
  createInductionResumeValues(Lp, CountRoundDown);

  return completeLoopSkeleton(Lp, OrigLoopID);
}
3876 
// Fix up external users of the induction variable. At this point, we are
// in LCSSA form, with all external PHIs that use the IV having one input value,
// coming from the remainder loop. We need those PHIs to also have a correct
// value for the IV when arriving directly from the middle block.
void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
                                       const InductionDescriptor &II,
                                       Value *CountRoundDown, Value *EndValue,
                                       BasicBlock *MiddleBlock) {
  // There are two kinds of external IV usages - those that use the value
  // computed in the last iteration (the PHI) and those that use the penultimate
  // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they, obviously, have different values.

  assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");

  // Map from each external-user phi to the value it should receive when
  // control arrives from MiddleBlock; filled below, applied at the end.
  DenseMap<Value *, Value *> MissingVals;

  // An external user of the last iteration's value should see the value that
  // the remainder loop uses to initialize its own IV.
  Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
  for (User *U : PostInc->users()) {
    Instruction *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      assert(isa<PHINode>(UI) && "Expected LCSSA form");
      MissingVals[UI] = EndValue;
    }
  }

  // An external user of the penultimate value need to see EndValue - Step.
  // The simplest way to get this is to recompute it from the constituent SCEVs,
  // that is Start + (Step * (CRD - 1)).
  for (User *U : OrigPhi->users()) {
    auto *UI = cast<Instruction>(U);
    if (!OrigLoop->contains(UI)) {
      const DataLayout &DL =
          OrigLoop->getHeader()->getModule()->getDataLayout();
      assert(isa<PHINode>(UI) && "Expected LCSSA form");

      IRBuilder<> B(MiddleBlock->getTerminator());

      // Fast-math-flags propagate from the original induction instruction.
      if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
        B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());

      // Cast CRD - 1 to the step's type (SIToFP for FP inductions).
      Value *CountMinusOne = B.CreateSub(
          CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
      Value *CMO =
          !II.getStep()->getType()->isIntegerTy()
              ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
                             II.getStep()->getType())
              : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
      CMO->setName("cast.cmo");
      Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
      Escape->setName("ind.escape");
      MissingVals[UI] = Escape;
    }
  }

  for (auto &I : MissingVals) {
    PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each-other,
    // that is %IV2 = phi [...], [ %IV1, %latch ]
    // In this case, if IV1 has an external use, we need to avoid adding both
    // "last value of IV1" and "penultimate value of IV2". So, verify that we
    // don't already have an incoming value for the middle block.
    if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
      PHI->addIncoming(I.second, MiddleBlock);
  }
}
3946 
3947 namespace {
3948 
3949 struct CSEDenseMapInfo {
3950   static bool canHandle(const Instruction *I) {
3951     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3952            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3953   }
3954 
3955   static inline Instruction *getEmptyKey() {
3956     return DenseMapInfo<Instruction *>::getEmptyKey();
3957   }
3958 
3959   static inline Instruction *getTombstoneKey() {
3960     return DenseMapInfo<Instruction *>::getTombstoneKey();
3961   }
3962 
3963   static unsigned getHashValue(const Instruction *I) {
3964     assert(canHandle(I) && "Unknown instruction!");
3965     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3966                                                            I->value_op_end()));
3967   }
3968 
3969   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3970     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3971         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3972       return LHS == RHS;
3973     return LHS->isIdenticalTo(RHS);
3974   }
3975 };
3976 
3977 } // end anonymous namespace
3978 
3979 ///Perform cse of induction variable instructions.
3980 static void cse(BasicBlock *BB) {
3981   // Perform simple cse.
3982   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3983   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3984     if (!CSEDenseMapInfo::canHandle(&In))
3985       continue;
3986 
3987     // Check if we can replace this instruction with any of the
3988     // visited instructions.
3989     if (Instruction *V = CSEMap.lookup(&In)) {
3990       In.replaceAllUsesWith(V);
3991       In.eraseFromParent();
3992       continue;
3993     }
3994 
3995     CSEMap[&In] = &In;
3996   }
3997 }
3998 
3999 InstructionCost
4000 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
4001                                               bool &NeedToScalarize) const {
4002   Function *F = CI->getCalledFunction();
4003   Type *ScalarRetTy = CI->getType();
4004   SmallVector<Type *, 4> Tys, ScalarTys;
4005   for (auto &ArgOp : CI->args())
4006     ScalarTys.push_back(ArgOp->getType());
4007 
4008   // Estimate cost of scalarized vector call. The source operands are assumed
4009   // to be vectors, so we need to extract individual elements from there,
4010   // execute VF scalar calls, and then gather the result into the vector return
4011   // value.
4012   InstructionCost ScalarCallCost =
4013       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
4014   if (VF.isScalar())
4015     return ScalarCallCost;
4016 
4017   // Compute corresponding vector type for return value and arguments.
4018   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
4019   for (Type *ScalarTy : ScalarTys)
4020     Tys.push_back(ToVectorTy(ScalarTy, VF));
4021 
4022   // Compute costs of unpacking argument values for the scalar calls and
4023   // packing the return values to a vector.
4024   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
4025 
4026   InstructionCost Cost =
4027       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
4028 
4029   // If we can't emit a vector call for this function, then the currently found
4030   // cost is the cost we need to return.
4031   NeedToScalarize = true;
4032   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4033   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
4034 
4035   if (!TLI || CI->isNoBuiltin() || !VecFunc)
4036     return Cost;
4037 
4038   // If the corresponding vector cost is cheaper, return its cost.
4039   InstructionCost VectorCallCost =
4040       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
4041   if (VectorCallCost < Cost) {
4042     NeedToScalarize = false;
4043     Cost = VectorCallCost;
4044   }
4045   return Cost;
4046 }
4047 
4048 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
4049   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
4050     return Elt;
4051   return VectorType::get(Elt, VF);
4052 }
4053 
4054 InstructionCost
4055 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
4056                                                    ElementCount VF) const {
4057   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4058   assert(ID && "Expected intrinsic call!");
4059   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
4060   FastMathFlags FMF;
4061   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
4062     FMF = FPMO->getFastMathFlags();
4063 
4064   SmallVector<const Value *> Arguments(CI->args());
4065   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
4066   SmallVector<Type *> ParamTys;
4067   std::transform(FTy->param_begin(), FTy->param_end(),
4068                  std::back_inserter(ParamTys),
4069                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
4070 
4071   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
4072                                     dyn_cast<IntrinsicInst>(CI));
4073   return TTI.getIntrinsicInstrCost(CostAttrs,
4074                                    TargetTransformInfo::TCK_RecipThroughput);
4075 }
4076 
4077 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
4078   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
4079   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
4080   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
4081 }
4082 
4083 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
4084   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
4085   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
4086   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
4087 }
4088 
/// Shrink the integer operations recorded in the cost model's minimal
/// bit-width map (MinBWs) to narrower vector types: each such instruction
/// gets its operands truncated, a narrow clone created, and its result
/// re-extended to the original type. InstCombine later removes the
/// redundant ext/trunc pairs this introduces.
void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
  // For every instruction `I` in MinBWs, truncate the operands, create a
  // truncated version of `I` and reextend its result. InstCombine runs
  // later and will remove any ext/trunc pairs.
  SmallPtrSet<Value *, 4> Erased;
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from State indicates that it
    // wasn't vectorized.
    // FIXME: Should not rely on getVPValue at this point.
    VPValue *Def = State.Plan->getVPValue(KV.first, true);
    if (!State.hasAnyVectorValue(Def))
      continue;
    // Each unrolled part is shrunk independently.
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      // Skip values already erased earlier in this loop, dead values, and
      // anything that is not an instruction.
      if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
        continue;
      Type *OriginalTy = I->getType();
      Type *ScalarTruncatedTy =
          IntegerType::get(OriginalTy->getContext(), KV.second);
      auto *TruncatedTy = VectorType::get(
          ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
      // Already at the minimal width; nothing to do.
      if (TruncatedTy == OriginalTy)
        continue;

      // Emit the replacement code immediately before the wide instruction.
      IRBuilder<> B(cast<Instruction>(I));
      // Narrow a value to TruncatedTy, looking through a zext whose source
      // already has the desired type to avoid a redundant trunc.
      auto ShrinkOperand = [&](Value *V) -> Value * {
        if (auto *ZI = dyn_cast<ZExtInst>(V))
          if (ZI->getSrcTy() == TruncatedTy)
            return ZI->getOperand(0);
        return B.CreateZExtOrTrunc(V, TruncatedTy);
      };

      // The actual instruction modification depends on the instruction type,
      // unfortunately.
      Value *NewI = nullptr;
      if (auto *BO = dyn_cast<BinaryOperator>(I)) {
        NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
                             ShrinkOperand(BO->getOperand(1)));

        // Any wrapping introduced by shrinking this operation shouldn't be
        // considered undefined behavior. So, we can't unconditionally copy
        // arithmetic wrapping flags to NewI.
        cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
      } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
        NewI =
            B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
                         ShrinkOperand(CI->getOperand(1)));
      } else if (auto *SI = dyn_cast<SelectInst>(I)) {
        // Only the selected values are narrowed; the condition keeps its type.
        NewI = B.CreateSelect(SI->getCondition(),
                              ShrinkOperand(SI->getTrueValue()),
                              ShrinkOperand(SI->getFalseValue()));
      } else if (auto *CI = dyn_cast<CastInst>(I)) {
        switch (CI->getOpcode()) {
        default:
          llvm_unreachable("Unhandled cast!");
        case Instruction::Trunc:
          NewI = ShrinkOperand(CI->getOperand(0));
          break;
        case Instruction::SExt:
          NewI = B.CreateSExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        case Instruction::ZExt:
          NewI = B.CreateZExtOrTrunc(
              CI->getOperand(0),
              smallestIntegerVectorType(OriginalTy, TruncatedTy));
          break;
        }
      } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
        // A shuffle's operands may have a different element count than its
        // result, so narrow each operand using its own element count.
        auto Elements0 =
            cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
            SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
        auto Elements1 =
            cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
        auto *O1 = B.CreateZExtOrTrunc(
            SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));

        NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
      } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
        // Don't do anything with the operands, just extend the result.
        continue;
      } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
        auto Elements =
            cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
            IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
        NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
      } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
        auto Elements =
            cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
        auto *O0 = B.CreateZExtOrTrunc(
            EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
        NewI = B.CreateExtractElement(O0, EE->getOperand(2));
      } else {
        // If we don't know what to do, be conservative and don't do anything.
        continue;
      }

      // Lastly, extend the result.
      NewI->takeName(cast<Instruction>(I));
      Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
      I->replaceAllUsesWith(Res);
      cast<Instruction>(I)->eraseFromParent();
      Erased.insert(I);
      State.reset(Def, Res, Part);
    }
  }

  // We'll have created a bunch of ZExts that are now parentless. Clean up.
  for (const auto &KV : Cost->getMinimalBitwidths()) {
    // If the value wasn't vectorized, we must maintain the original scalar
    // type. The absence of the value from State indicates that it
    // wasn't vectorized.
    // FIXME: Should not rely on getVPValue at this point.
    VPValue *Def = State.Plan->getVPValue(KV.first, true);
    if (!State.hasAnyVectorValue(Def))
      continue;
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *I = State.get(Def, Part);
      // A dead zext can be dropped, leaving the narrow value in State.
      ZExtInst *Inst = dyn_cast<ZExtInst>(I);
      if (Inst && Inst->use_empty()) {
        Value *NewI = Inst->getOperand(0);
        Inst->eraseFromParent();
        State.reset(Def, NewI, Part);
      }
    }
  }
}
4221 
/// Final clean-up after the VPlan has been executed: shrink over-wide
/// operations, wire up the PHI nodes left incomplete during widening, fix
/// users of loop values outside the loop, sink predicated scalar operands,
/// CSE redundant induction code, and update profile weights.
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Insert truncates and extends for any truncated instructions as hints to
  // InstCombine.
  if (VF.isVector())
    truncateToMinimalBitwidths(State);

  // Fix widened non-induction PHIs by setting up the PHI operands.
  if (OrigPHIsToFix.size()) {
    assert(EnableVPlanNativePath &&
           "Unexpected non-induction PHIs for fixup in non VPlan-native path");
    fixNonInductionPHIs(State);
  }

  // At this point every instruction in the original loop is widened to a
  // vector form. Now we need to fix the recurrences in the loop. These PHI
  // nodes are currently empty because we did not want to introduce cycles.
  // This is the second stage of vectorizing recurrences.
  fixCrossIterationPHIs(State);

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(OrigLoop);

  // If we inserted an edge from the middle block to the unique exit block,
  // update uses outside the loop (phis) to account for the newly inserted
  // edge.
  if (!Cost->requiresScalarEpilogue(VF)) {
    // Fix-up external users of the induction variables.
    for (auto &Entry : Legal->getInductionVars())
      fixupIVUsers(Entry.first, Entry.second,
                   getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
                   IVEndValues[Entry.first], LoopMiddleBlock);

    fixLCSSAPHIs(State);
  }

  // Sink operands of predicated instructions into the blocks created for them.
  for (Instruction *PI : PredicatedInstructions)
    sinkScalarOperands(&*PI);

  // Remove redundant induction instructions.
  cse(LoopVectorBody);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that original loop
  // represented by LoopScalarBody becomes remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the weight
  // to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
  setProfileInfoAfterUnrolling(
      LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
      LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
}
4280 
4281 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4282   // In order to support recurrences we need to be able to vectorize Phi nodes.
4283   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4284   // stage #2: We now need to fix the recurrences by adding incoming edges to
4285   // the currently empty PHI nodes. At this point every instruction in the
4286   // original loop is widened to a vector form so we can use them to construct
4287   // the incoming edges.
4288   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4289   for (VPRecipeBase &R : Header->phis()) {
4290     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4291       fixReduction(ReductionPhi, State);
4292     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4293       fixFirstOrderRecurrence(FOR, State);
4294   }
4295 }
4296 
/// Complete the vectorization of a first-order recurrence phi \p PhiR:
/// extract the scalar start value for the epilogue loop from the last
/// vector iteration and fix up any LCSSA users of the recurrence.
void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
                                                  VPTransformState &State) {
  // This is the second phase of vectorizing first-order recurrences. An
  // overview of the transformation is described below. Suppose we have the
  // following loop.
  //
  //   for (int i = 0; i < n; ++i)
  //     b[i] = a[i] - a[i - 1];
  //
  // There is a first-order recurrence on "a". For this loop, the shorthand
  // scalar IR looks like:
  //
  //   scalar.ph:
  //     s_init = a[-1]
  //     br scalar.body
  //
  //   scalar.body:
  //     i = phi [0, scalar.ph], [i+1, scalar.body]
  //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
  //     s2 = a[i]
  //     b[i] = s2 - s1
  //     br cond, scalar.body, ...
  //
  // In this example, s1 is a recurrence because its value depends on the
  // previous iteration. In the first phase of vectorization, we created a
  // vector phi v1 for s1. We now complete the vectorization and produce the
  // shorthand vector IR shown below (for VF = 4, UF = 1).
  //
  //   vector.ph:
  //     v_init = vector(..., ..., ..., a[-1])
  //     br vector.body
  //
  //   vector.body
  //     i = phi [0, vector.ph], [i+4, vector.body]
  //     v1 = phi [v_init, vector.ph], [v2, vector.body]
  //     v2 = a[i, i+1, i+2, i+3];
  //     v3 = vector(v1(3), v2(0, 1, 2))
  //     b[i, i+1, i+2, i+3] = v2 - v3
  //     br cond, vector.body, middle.block
  //
  //   middle.block:
  //     x = v2(3)
  //     br scalar.ph
  //
  //   scalar.ph:
  //     s_init = phi [x, middle.block], [a[-1], otherwise]
  //     br scalar.body
  //
  // After execution completes the vector loop, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.

  // Extract the last vector element in the middle block. This will be the
  // initial value for the recurrence when jumping to the scalar loop.
  VPValue *PreviousDef = PhiR->getBackedgeValue();
  Value *Incoming = State.get(PreviousDef, UF - 1);
  auto *ExtractForScalar = Incoming;
  auto *IdxTy = Builder.getInt32Ty();
  if (VF.isVector()) {
    // Extract lane VF-1 (computed at runtime to handle scalable vectors).
    auto *One = ConstantInt::get(IdxTy, 1);
    Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
    ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
                                                    "vector.recur.extract");
  }
  // Extract the second last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
  Value *ExtractForPhiUsedOutsideLoop = nullptr;
  if (VF.isVector()) {
    auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
    auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
    ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
        Incoming, Idx, "vector.recur.extract.for.phi");
  } else if (UF > 1)
    // When loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
    // of `Incoming`. This is analogous to the vectorized case above: extracting
    // the second last element when VF > 1.
    ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);

  // Fix the initial value of the original recurrence in the scalar loop.
  Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
  PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
  auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
  auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
  // The middle block contributes the extracted value; any bypass block
  // contributes the original scalar start value.
  for (auto *BB : predecessors(LoopScalarPreHeader)) {
    auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
    Start->addIncoming(Incoming, BB);
  }

  Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
  Phi->setName("scalar.recur");

  // Finally, fix users of the recurrence outside the loop. The users will need
  // either the last value of the scalar recurrence or the last value of the
  // vector recurrence we extracted in the middle block. Since the loop is in
  // LCSSA form, we just need to find all the phi nodes for the original scalar
  // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from middle to exit
  // and thus no phis which needed updating.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
        LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
}
4407 
/// Complete the vectorization of a reduction phi \p PhiR: combine the
/// unrolled parts, create the final target reduction in the middle block,
/// and fix up the scalar loop's phi and any LCSSA users.
void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
                                       VPTransformState &State) {
  PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
  assert(Legal->isReductionVariable(OrigPhi) &&
         "Unable to find the reduction variable");
  const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

  RecurKind RK = RdxDesc.getRecurrenceKind();
  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
  setDebugLocFromInst(ReductionStartValue);

  VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
  // This is the vector-clone of the value that leaves the loop.
  Type *VecTy = State.get(LoopExitInstDef, 0)->getType();

  // Wrap flags are in general invalid after vectorization, clear them.
  clearReductionWrapFlags(RdxDesc, State);

  // Before each round, move the insertion point right between
  // the PHIs and the values we are going to write.
  // This allows us to write both PHINodes and the extractelement
  // instructions.
  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());

  setDebugLocFromInst(LoopExitInst);

  Type *PhiTy = OrigPhi->getType();
  // If tail is folded by masking, the vector value to leave the loop should be
  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
  // instead of the former. For an inloop reduction the reduction will already
  // be predicated, and does not need to be handled here.
  if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
      // Locate the select that guards this part's exit value.
      Value *Sel = nullptr;
      for (User *U : VecLoopExitInst->users()) {
        if (isa<SelectInst>(U)) {
          assert(!Sel && "Reduction exit feeding two selects");
          Sel = U;
        } else
          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
      }
      assert(Sel && "Reduction exit feeds no select");
      State.reset(LoopExitInstDef, Sel, Part);

      // If the target can create a predicated operator for the reduction at no
      // extra cost in the loop (for example a predicated vadd), it can be
      // cheaper for the select to remain in the loop than be sunk out of it,
      // and so use the select value for the phi instead of the old
      // LoopExitValue.
      if (PreferPredicatedReductionSelect ||
          TTI->preferPredicatedReductionSelect(
              RdxDesc.getOpcode(), PhiTy,
              TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi =
            cast<PHINode>(State.get(PhiR, Part));
        VecRdxPhi->setIncomingValueForBlock(
            LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
      }
    }
  }

  // If the vector reduction can be performed in a smaller type, we truncate
  // then extend the loop exit value to enable InstCombine to evaluate the
  // entire expression in the smaller type.
  if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
    assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
    Builder.SetInsertPoint(
        LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
    VectorParts RdxParts(UF);
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = State.get(LoopExitInstDef, Part);
      Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
                                        : Builder.CreateZExt(Trunc, VecTy);
      // Route all users except the new trunc through the trunc/ext pair.
      for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
        if (U != Trunc) {
          U->replaceUsesOfWith(RdxParts[Part], Extnd);
          RdxParts[Part] = Extnd;
        }
    }
    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
    for (unsigned Part = 0; Part < UF; ++Part) {
      RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
      State.reset(LoopExitInstDef, RdxParts[Part], Part);
    }
  }

  // Reduce all of the unrolled parts into a single vector.
  Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
  unsigned Op = RecurrenceDescriptor::getOpcode(RK);

  // The middle block terminator has already been assigned a DebugLoc here (the
  // OrigLoop's single latch terminator). We want the whole middle block to
  // appear to execute on this line because: (a) it is all compiler generated,
  // (b) these instructions are always executed after evaluating the latch
  // conditional branch, and (c) other passes may add new predecessors which
  // terminate on this line. This is the easiest way to ensure we don't
  // accidentally cause an extra step back into the loop while debugging.
  setDebugLocFromInst(LoopMiddleBlock->getTerminator());
  if (PhiR->isOrdered())
    ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
  else {
    // Floating-point operations should have some FMF to enable the reduction.
    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
    Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
    for (unsigned Part = 1; Part < UF; ++Part) {
      Value *RdxPart = State.get(LoopExitInstDef, Part);
      if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
        ReducedPartRdx = Builder.CreateBinOp(
            (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
      } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
        ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
                                           ReducedPartRdx, RdxPart);
      else
        ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
    }
  }

  // Create the reduction after the loop. Note that inloop reductions create the
  // target reduction in the loop using a Reduction recipe.
  if (VF.isVector() && !PhiR->isInLoop()) {
    ReducedPartRdx =
        createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
    // If the reduction can be performed in a smaller type, we need to extend
    // the reduction to the wider type before we branch to the original loop.
    if (PhiTy != RdxDesc.getRecurrenceType())
      ReducedPartRdx = RdxDesc.isSigned()
                           ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
                           : Builder.CreateZExt(ReducedPartRdx, PhiTy);
  }

  // Create a phi node that merges control-flow from the backedge-taken check
  // block and the middle block.
  PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
                                        LoopScalarPreHeader->getTerminator());
  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Now, we need to fix the users of the reduction variable
  // inside and outside of the scalar remainder loop.

  // We know that the loop is in LCSSA form. We need to update the PHI nodes
  // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
  if (!Cost->requiresScalarEpilogue(VF))
    for (PHINode &LCSSAPhi : LoopExitBlock->phis())
      if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
        LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);

  // Fix the scalar loop reduction variable with the incoming reduction sum
  // from the vector body and from the backedge value.
  int IncomingEdgeBlockIdx =
      OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
  // Pick the other block.
  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
  OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
  OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
}
4572 
4573 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4574                                                   VPTransformState &State) {
4575   RecurKind RK = RdxDesc.getRecurrenceKind();
4576   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4577     return;
4578 
4579   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4580   assert(LoopExitInstr && "null loop exit instruction");
4581   SmallVector<Instruction *, 8> Worklist;
4582   SmallPtrSet<Instruction *, 8> Visited;
4583   Worklist.push_back(LoopExitInstr);
4584   Visited.insert(LoopExitInstr);
4585 
4586   while (!Worklist.empty()) {
4587     Instruction *Cur = Worklist.pop_back_val();
4588     if (isa<OverflowingBinaryOperator>(Cur))
4589       for (unsigned Part = 0; Part < UF; ++Part) {
4590         // FIXME: Should not rely on getVPValue at this point.
4591         Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4592         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4593       }
4594 
4595     for (User *U : Cur->users()) {
4596       Instruction *UI = cast<Instruction>(U);
4597       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4598           Visited.insert(UI).second)
4599         Worklist.push_back(UI);
4600     }
4601   }
4602 }
4603 
4604 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4605   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4606     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4607       // Some phis were already hand updated by the reduction and recurrence
4608       // code above, leave them alone.
4609       continue;
4610 
4611     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4612     // Non-instruction incoming values will have only one value.
4613 
4614     VPLane Lane = VPLane::getFirstLane();
4615     if (isa<Instruction>(IncomingValue) &&
4616         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4617                                            VF))
4618       Lane = VPLane::getLastLaneForVF(VF);
4619 
4620     // Can be a loop invariant incoming value or the last scalar value to be
4621     // extracted from the vectorized loop.
4622     // FIXME: Should not rely on getVPValue at this point.
4623     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4624     Value *lastIncomingValue =
4625         OrigLoop->isLoopInvariant(IncomingValue)
4626             ? IncomingValue
4627             : State.get(State.Plan->getVPValue(IncomingValue, true),
4628                         VPIteration(UF - 1, Lane));
4629     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4630   }
4631 }
4632 
// Sink scalarized operands of the predicated instruction \p PredInst into the
// predicated block containing it, when legal (all uses are inside that
// block). Runs a fixed-point worklist iteration: each successful sink may
// enable further sinks, so we re-run until a full pass makes no change.
void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
  // The basic block and loop containing the predicated instruction.
  auto *PredBB = PredInst->getParent();
  auto *VectorLoop = LI->getLoopFor(PredBB);

  // Initialize a worklist with the operands of the predicated instruction.
  SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.
  SmallVector<Instruction *, 8> InstsToReanalyze;

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto isBlockOfUsePredicated = [&](Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is not in the loop,
      // or may have side effects.
      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
          I->mayHaveSideEffects())
        continue;

      // If the instruction is already in PredBB, check if we can sink its
      // operands. In that case, VPlan's sinkScalarOperands() succeeded in
      // sinking the scalar instruction I, hence it appears in PredBB; but it
      // may have failed to sink I's operands (recursively), which we try
      // (again) here.
      if (I->getParent() == PredBB) {
        Worklist.insert(I->op_begin(), I->op_end());
        continue;
      }

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(&*PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());

      // The sinking may have enabled other instructions to be sunk, so we will
      // need to iterate.
      Changed = true;
    }
  } while (Changed);
}
4706 
4707 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4708   for (PHINode *OrigPhi : OrigPHIsToFix) {
4709     VPWidenPHIRecipe *VPPhi =
4710         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4711     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4712     // Make sure the builder has a valid insert point.
4713     Builder.SetInsertPoint(NewPhi);
4714     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4715       VPValue *Inc = VPPhi->getIncomingValue(i);
4716       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4717       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4718     }
4719   }
4720 }
4721 
// Returns true if the reduction described by \p RdxDesc must be generated
// in-order (strict FP semantics). Thin wrapper delegating to the cost model.
bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
  return Cost->useOrderedReductions(RdxDesc);
}
4725 
// Widen the GEP \p GEP, storing one value per unroll part into \p State for
// \p WidenGEPRec. Two cases:
//  - All operands loop-invariant (and VF > 1): clone the scalar GEP and
//    broadcast it to a vector of pointers.
//  - Otherwise: build a new GEP whose loop-varying operands are the widened
//    (per-part) values and whose invariant operands stay scalar.
void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP,
                                   VPWidenGEPRecipe *WidenGEPRec,
                                   VPUser &Operands, unsigned UF,
                                   ElementCount VF, bool IsPtrLoopInvariant,
                                   SmallBitVector &IsIndexLoopInvariant,
                                   VPTransformState &State) {
  // Construct a vector GEP by widening the operands of the scalar GEP as
  // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
  // results in a vector of pointers when at least one operand of the GEP
  // is vector-typed. Thus, to keep the representation compact, we only use
  // vector-typed operands for loop-varying values.

  if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
    // If we are vectorizing, but the GEP has only loop-invariant operands,
    // the GEP we build (by only using vector-typed operands for
    // loop-varying values) would be a scalar pointer. Thus, to ensure we
    // produce a vector of pointers, we need to either arbitrarily pick an
    // operand to broadcast, or broadcast a clone of the original GEP.
    // Here, we broadcast a clone of the original.
    //
    // TODO: If at some point we decide to scalarize instructions having
    //       loop-invariant operands, this special case will no longer be
    //       required. We would add the scalarization decision to
    //       collectLoopScalars() and teach getVectorValue() to broadcast
    //       the lane-zero scalar value.
    auto *Clone = Builder.Insert(GEP->clone());
    for (unsigned Part = 0; Part < UF; ++Part) {
      // Each part is the same splat; a separate value is still recorded per
      // part so downstream per-part lookups succeed.
      Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
      State.set(WidenGEPRec, EntryPart, Part);
      addMetadata(EntryPart, GEP);
    }
  } else {
    // If the GEP has at least one loop-varying operand, we are sure to
    // produce a vector of pointers. But if we are only unrolling, we want
    // to produce a scalar GEP for each unroll part. Thus, the GEP we
    // produce with the code below will be scalar (if VF == 1) or vector
    // (otherwise). Note that for the unroll-only case, we still maintain
    // values in the vector mapping with initVector, as we do for other
    // instructions.
    for (unsigned Part = 0; Part < UF; ++Part) {
      // The pointer operand of the new GEP. If it's loop-invariant, we
      // won't broadcast it.
      auto *Ptr = IsPtrLoopInvariant
                      ? State.get(Operands.getOperand(0), VPIteration(0, 0))
                      : State.get(Operands.getOperand(0), Part);

      // Collect all the indices for the new GEP. If any index is
      // loop-invariant, we won't broadcast it.
      SmallVector<Value *, 4> Indices;
      for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
        VPValue *Operand = Operands.getOperand(I);
        if (IsIndexLoopInvariant[I - 1])
          Indices.push_back(State.get(Operand, VPIteration(0, 0)));
        else
          Indices.push_back(State.get(Operand, Part));
      }

      // If the GEP instruction is vectorized and was in a basic block that
      // needed predication, we can't propagate the poison-generating 'inbounds'
      // flag. The control flow has been linearized and the GEP is no longer
      // guarded by the predicate, which could make the 'inbounds' properties to
      // no longer hold.
      bool IsInBounds = GEP->isInBounds() &&
                        State.MayGeneratePoisonRecipes.count(WidenGEPRec) == 0;

      // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
      // but it should be a vector, otherwise.
      auto *NewGEP =
          IsInBounds
              ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
                                          Indices)
              : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
      assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
             "NewGEP is not a pointer vector");
      State.set(WidenGEPRec, NewGEP, Part);
      addMetadata(NewGEP, GEP);
    }
  }
}
4805 
// Widen the header phi \p PN. In the VPlan-native path any non-induction phi
// is widened to an empty vector phi (operands filled in later by
// fixNonInductionPHIs). Otherwise only pointer inductions reach this point:
// they are either scalarized per lane/part, or materialized as a pointer phi
// stepped by a GEP in the loop latch.
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
                                              VPWidenPHIRecipe *PhiR,
                                              VPTransformState &State) {
  PHINode *P = cast<PHINode>(PN);
  if (EnableVPlanNativePath) {
    // Currently we enter here in the VPlan-native path for non-induction
    // PHIs where all control flow is uniform. We simply widen these PHIs.
    // Create a vector phi with no operands - the vector phi operands will be
    // set at the end of vector code generation.
    Type *VecTy = (State.VF.isScalar())
                      ? PN->getType()
                      : VectorType::get(PN->getType(), State.VF);
    Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
    State.set(PhiR, VecPhi, 0);
    OrigPHIsToFix.push_back(P);

    return;
  }

  assert(PN->getParent() == OrigLoop->getHeader() &&
         "Non-header phis should have been handled elsewhere");

  // In order to support recurrences we need to be able to vectorize Phi nodes.
  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
  // this value when we vectorize all of the instructions that use the PHI.

  assert(!Legal->isReductionVariable(P) &&
         "reductions should be handled elsewhere");

  setDebugLocFromInst(P);

  // This PHINode must be an induction variable.
  // Make sure that we know about it.
  assert(Legal->getInductionVars().count(P) && "Not an induction variable");

  InductionDescriptor II = Legal->getInductionVars().lookup(P);
  const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();

  // FIXME: The newly created binary instructions should contain nsw/nuw flags,
  // which can be found from the original scalar operations.
  switch (II.getKind()) {
  case InductionDescriptor::IK_NoInduction:
    llvm_unreachable("Unknown induction");
  case InductionDescriptor::IK_IntInduction:
  case InductionDescriptor::IK_FpInduction:
    llvm_unreachable("Integer/fp induction is handled elsewhere.");
  case InductionDescriptor::IK_PtrInduction: {
    // Handle the pointer induction variable case.
    assert(P->getType()->isPointerTy() && "Unexpected type.");

    if (Cost->isScalarAfterVectorization(P, State.VF)) {
      // This is the normalized GEP that starts counting at zero.
      Value *PtrInd =
          Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
      // Determine the number of scalars we need to generate for each unroll
      // iteration. If the instruction is uniform, we only need to generate the
      // first lane. Otherwise, we generate all VF values.
      bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
      unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();

      // For non-uniform scalable VFs we cannot enumerate all lanes with a
      // compile-time constant, so also materialize a whole-vector index.
      bool NeedsVectorIndex = !IsUniform && VF.isScalable();
      Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
      if (NeedsVectorIndex) {
        Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
        UnitStepVec = Builder.CreateStepVector(VecIVTy);
        PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
      }

      for (unsigned Part = 0; Part < UF; ++Part) {
        Value *PartStart =
            createStepForVF(Builder, PtrInd->getType(), VF, Part);

        if (NeedsVectorIndex) {
          // Here we cache the whole vector, which means we can support the
          // extraction of any lane. However, in some cases the extractelement
          // instruction that is generated for scalar uses of this vector (e.g.
          // a load instruction) is not folded away. Therefore we still
          // calculate values for the first n lanes to avoid redundant moves
          // (when extracting the 0th element) and to produce scalar code (i.e.
          // additional add/gep instructions instead of expensive extractelement
          // instructions) when extracting higher-order elements.
          Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
          Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
          Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
          Value *SclrGep =
              emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
          SclrGep->setName("next.gep");
          State.set(PhiR, SclrGep, Part);
        }

        // Generate the per-lane scalar addresses (see comment above: done
        // even when the whole vector was cached).
        for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
          Value *Idx = Builder.CreateAdd(
              PartStart, ConstantInt::get(PtrInd->getType(), Lane));
          Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
          Value *SclrGep =
              emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
          SclrGep->setName("next.gep");
          State.set(PhiR, SclrGep, VPIteration(Part, Lane));
        }
      }
      return;
    }
    assert(isa<SCEVConstant>(II.getStep()) &&
           "Induction step not a SCEV constant!");
    Type *PhiType = II.getStep()->getType();

    // Build a pointer phi
    Value *ScalarStartValue = II.getStartValue();
    Type *ScStValueType = ScalarStartValue->getType();
    PHINode *NewPointerPhi =
        PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
    NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);

    // A pointer induction, performed by using a gep
    // The pointer advances by VF * UF * step elements per vector iteration.
    BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
    Instruction *InductionLoc = LoopLatch->getTerminator();
    const SCEV *ScalarStep = II.getStep();
    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
    Value *ScalarStepValue =
        Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
    Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
    Value *NumUnrolledElems =
        Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
    Value *InductionGEP = GetElementPtrInst::Create(
        II.getElementType(), NewPointerPhi,
        Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
        InductionLoc);
    NewPointerPhi->addIncoming(InductionGEP, LoopLatch);

    // Create UF many actual address geps that use the pointer
    // phi as base and a vectorized version of the step value
    // (<step*0, ..., step*N>) as offset.
    for (unsigned Part = 0; Part < State.UF; ++Part) {
      Type *VecPhiType = VectorType::get(PhiType, State.VF);
      Value *StartOffsetScalar =
          Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
      Value *StartOffset =
          Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
      // Create a vector of consecutive numbers from zero to VF.
      StartOffset =
          Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));

      Value *GEP = Builder.CreateGEP(
          II.getElementType(), NewPointerPhi,
          Builder.CreateMul(
              StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
              "vector.gep"));
      State.set(PhiR, GEP, Part);
    }
  }
  }
}
4959 
4960 /// A helper function for checking whether an integer division-related
4961 /// instruction may divide by zero (in which case it must be predicated if
4962 /// executed conditionally in the scalar code).
4963 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4964 /// Non-zero divisors that are non compile-time constants will not be
4965 /// converted into multiplication, so we will still end up scalarizing
4966 /// the division, but can do so w/o predication.
4967 static bool mayDivideByZero(Instruction &I) {
4968   assert((I.getOpcode() == Instruction::UDiv ||
4969           I.getOpcode() == Instruction::SDiv ||
4970           I.getOpcode() == Instruction::URem ||
4971           I.getOpcode() == Instruction::SRem) &&
4972          "Unexpected instruction");
4973   Value *Divisor = I.getOperand(1);
4974   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4975   return !CInt || CInt->isZero();
4976 }
4977 
// Widen the scalar instruction \p I into UF vector instructions, storing one
// value per unroll part into \p State for \p WidenRec. Handles simple unary/
// binary ops, compares, and casts; everything else has its own recipe and
// must not reach this switch.
void InnerLoopVectorizer::widenInstruction(Instruction &I,
                                           VPWidenRecipe *WidenRec,
                                           VPTransformState &State) {
  switch (I.getOpcode()) {
  case Instruction::Call:
  case Instruction::Br:
  case Instruction::PHI:
  case Instruction::GetElementPtr:
  case Instruction::Select:
    llvm_unreachable("This instruction is handled by a different recipe.");
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::FNeg:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // Just widen unops and binops.
    setDebugLocFromInst(&I);

    for (unsigned Part = 0; Part < UF; ++Part) {
      SmallVector<Value *, 2> Ops;
      for (VPValue *VPOp : WidenRec->operands())
        Ops.push_back(State.get(VPOp, Part));

      Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);

      // CreateNAryOp may constant-fold, so only copy flags when an actual
      // instruction was created.
      if (auto *VecOp = dyn_cast<Instruction>(V)) {
        VecOp->copyIRFlags(&I);

        // If the instruction is vectorized and was in a basic block that needed
        // predication, we can't propagate poison-generating flags (nuw/nsw,
        // exact, etc.). The control flow has been linearized and the
        // instruction is no longer guarded by the predicate, which could make
        // the flag properties to no longer hold.
        if (State.MayGeneratePoisonRecipes.count(WidenRec) > 0)
          VecOp->dropPoisonGeneratingFlags();
      }

      // Use this vector value for all users of the original instruction.
      State.set(WidenRec, V, Part);
      addMetadata(V, &I);
    }

    break;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Widen compares. Generate vector compares.
    bool FCmp = (I.getOpcode() == Instruction::FCmp);
    auto *Cmp = cast<CmpInst>(&I);
    setDebugLocFromInst(Cmp);
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = State.get(WidenRec->getOperand(0), Part);
      Value *B = State.get(WidenRec->getOperand(1), Part);
      Value *C = nullptr;
      if (FCmp) {
        // Propagate fast math flags.
        IRBuilder<>::FastMathFlagGuard FMFG(Builder);
        Builder.setFastMathFlags(Cmp->getFastMathFlags());
        C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
      } else {
        C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
      }
      State.set(WidenRec, C, Part);
      addMetadata(C, &I);
    }

    break;
  }

  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto *CI = cast<CastInst>(&I);
    setDebugLocFromInst(CI);

    /// Vectorize casts.
    Type *DestTy =
        (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);

    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *A = State.get(WidenRec->getOperand(0), Part);
      Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
      State.set(WidenRec, Cast, Part);
      addMetadata(Cast, &I);
    }
    break;
  }
  default:
    // This instruction is not vectorized by simple widening.
    LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
    llvm_unreachable("Unhandled instruction!");
  } // end of switch.
}
5094 
5095 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
5096                                                VPUser &ArgOperands,
5097                                                VPTransformState &State) {
5098   assert(!isa<DbgInfoIntrinsic>(I) &&
5099          "DbgInfoIntrinsic should have been dropped during VPlan construction");
5100   setDebugLocFromInst(&I);
5101 
5102   Module *M = I.getParent()->getParent()->getParent();
5103   auto *CI = cast<CallInst>(&I);
5104 
5105   SmallVector<Type *, 4> Tys;
5106   for (Value *ArgOperand : CI->args())
5107     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
5108 
5109   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
5110 
5111   // The flag shows whether we use Intrinsic or a usual Call for vectorized
5112   // version of the instruction.
5113   // Is it beneficial to perform intrinsic call compared to lib call?
5114   bool NeedToScalarize = false;
5115   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
5116   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
5117   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
5118   assert((UseVectorIntrinsic || !NeedToScalarize) &&
5119          "Instruction should be scalarized elsewhere.");
5120   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
5121          "Either the intrinsic cost or vector call cost must be valid");
5122 
5123   for (unsigned Part = 0; Part < UF; ++Part) {
5124     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
5125     SmallVector<Value *, 4> Args;
5126     for (auto &I : enumerate(ArgOperands.operands())) {
5127       // Some intrinsics have a scalar argument - don't replace it with a
5128       // vector.
5129       Value *Arg;
5130       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
5131         Arg = State.get(I.value(), Part);
5132       else {
5133         Arg = State.get(I.value(), VPIteration(0, 0));
5134         if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
5135           TysForDecl.push_back(Arg->getType());
5136       }
5137       Args.push_back(Arg);
5138     }
5139 
5140     Function *VectorF;
5141     if (UseVectorIntrinsic) {
5142       // Use vector version of the intrinsic.
5143       if (VF.isVector())
5144         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
5145       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
5146       assert(VectorF && "Can't retrieve vector intrinsic.");
5147     } else {
5148       // Use vector version of the function call.
5149       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
5150 #ifndef NDEBUG
5151       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
5152              "Can't create vector function.");
5153 #endif
5154         VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
5155     }
5156       SmallVector<OperandBundleDef, 1> OpBundles;
5157       CI->getOperandBundlesAsDefs(OpBundles);
5158       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
5159 
5160       if (isa<FPMathOperator>(V))
5161         V->copyFastMathFlags(CI);
5162 
5163       State.set(Def, V, Part);
5164       addMetadata(V, &I);
5165   }
5166 }
5167 
5168 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
5169                                                  VPUser &Operands,
5170                                                  bool InvariantCond,
5171                                                  VPTransformState &State) {
5172   setDebugLocFromInst(&I);
5173 
5174   // The condition can be loop invariant  but still defined inside the
5175   // loop. This means that we can't just use the original 'cond' value.
5176   // We have to take the 'vectorized' value and pick the first lane.
5177   // Instcombine will make this a no-op.
5178   auto *InvarCond = InvariantCond
5179                         ? State.get(Operands.getOperand(0), VPIteration(0, 0))
5180                         : nullptr;
5181 
5182   for (unsigned Part = 0; Part < UF; ++Part) {
5183     Value *Cond =
5184         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
5185     Value *Op0 = State.get(Operands.getOperand(1), Part);
5186     Value *Op1 = State.get(Operands.getOperand(2), Part);
5187     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
5188     State.set(VPDef, Sel, Part);
5189     addMetadata(Sel, &I);
5190   }
5191 }
5192 
5193 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
5194   // We should not collect Scalars more than once per VF. Right now, this
5195   // function is called from collectUniformsAndScalars(), which already does
5196   // this check. Collecting Scalars for VF=1 does not make any sense.
5197   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
5198          "This function should not be visited twice for the same VF");
5199 
5200   SmallSetVector<Instruction *, 8> Worklist;
5201 
5202   // These sets are used to seed the analysis with pointers used by memory
5203   // accesses that will remain scalar.
5204   SmallSetVector<Instruction *, 8> ScalarPtrs;
5205   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5206   auto *Latch = TheLoop->getLoopLatch();
5207 
5208   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5209   // The pointer operands of loads and stores will be scalar as long as the
5210   // memory access is not a gather or scatter operation. The value operand of a
5211   // store will remain scalar if the store is scalarized.
5212   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5213     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5214     assert(WideningDecision != CM_Unknown &&
5215            "Widening decision should be ready at this moment");
5216     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5217       if (Ptr == Store->getValueOperand())
5218         return WideningDecision == CM_Scalarize;
5219     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5220            "Ptr is neither a value or pointer operand");
5221     return WideningDecision != CM_GatherScatter;
5222   };
5223 
5224   // A helper that returns true if the given value is a bitcast or
5225   // getelementptr instruction contained in the loop.
5226   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5227     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5228             isa<GetElementPtrInst>(V)) &&
5229            !TheLoop->isLoopInvariant(V);
5230   };
5231 
5232   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5233     if (!isa<PHINode>(Ptr) ||
5234         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5235       return false;
5236     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5237     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5238       return false;
5239     return isScalarUse(MemAccess, Ptr);
5240   };
5241 
5242   // A helper that evaluates a memory access's use of a pointer. If the
5243   // pointer is actually the pointer induction of a loop, it is being
5244   // inserted into Worklist. If the use will be a scalar use, and the
5245   // pointer is only used by memory accesses, we place the pointer in
5246   // ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
5247   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5248     if (isScalarPtrInduction(MemAccess, Ptr)) {
5249       Worklist.insert(cast<Instruction>(Ptr));
5250       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5251                         << "\n");
5252 
5253       Instruction *Update = cast<Instruction>(
5254           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5255 
5256       // If there is more than one user of Update (Ptr), we shouldn't assume it
5257       // will be scalar after vectorisation as other users of the instruction
5258       // may require widening. Otherwise, add it to ScalarPtrs.
5259       if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) {
5260         ScalarPtrs.insert(Update);
5261         return;
5262       }
5263     }
5264     // We only care about bitcast and getelementptr instructions contained in
5265     // the loop.
5266     if (!isLoopVaryingBitCastOrGEP(Ptr))
5267       return;
5268 
5269     // If the pointer has already been identified as scalar (e.g., if it was
5270     // also identified as uniform), there's nothing to do.
5271     auto *I = cast<Instruction>(Ptr);
5272     if (Worklist.count(I))
5273       return;
5274 
5275     // If the use of the pointer will be a scalar use, and all users of the
5276     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5277     // place the pointer in PossibleNonScalarPtrs.
5278     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5279           return isa<LoadInst>(U) || isa<StoreInst>(U);
5280         }))
5281       ScalarPtrs.insert(I);
5282     else
5283       PossibleNonScalarPtrs.insert(I);
5284   };
5285 
5286   // We seed the scalars analysis with three classes of instructions: (1)
5287   // instructions marked uniform-after-vectorization and (2) bitcast,
5288   // getelementptr and (pointer) phi instructions used by memory accesses
5289   // requiring a scalar use.
5290   //
5291   // (1) Add to the worklist all instructions that have been identified as
5292   // uniform-after-vectorization.
5293   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5294 
5295   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5296   // memory accesses requiring a scalar use. The pointer operands of loads and
5297   // stores will be scalar as long as the memory accesses is not a gather or
5298   // scatter operation. The value operand of a store will remain scalar if the
5299   // store is scalarized.
5300   for (auto *BB : TheLoop->blocks())
5301     for (auto &I : *BB) {
5302       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5303         evaluatePtrUse(Load, Load->getPointerOperand());
5304       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5305         evaluatePtrUse(Store, Store->getPointerOperand());
5306         evaluatePtrUse(Store, Store->getValueOperand());
5307       }
5308     }
5309   for (auto *I : ScalarPtrs)
5310     if (!PossibleNonScalarPtrs.count(I)) {
5311       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5312       Worklist.insert(I);
5313     }
5314 
5315   // Insert the forced scalars.
5316   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5317   // induction variable when the PHI user is scalarized.
5318   auto ForcedScalar = ForcedScalars.find(VF);
5319   if (ForcedScalar != ForcedScalars.end())
5320     for (auto *I : ForcedScalar->second)
5321       Worklist.insert(I);
5322 
5323   // Expand the worklist by looking through any bitcasts and getelementptr
5324   // instructions we've already identified as scalar. This is similar to the
5325   // expansion step in collectLoopUniforms(); however, here we're only
5326   // expanding to include additional bitcasts and getelementptr instructions.
5327   unsigned Idx = 0;
5328   while (Idx != Worklist.size()) {
5329     Instruction *Dst = Worklist[Idx++];
5330     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5331       continue;
5332     auto *Src = cast<Instruction>(Dst->getOperand(0));
5333     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5334           auto *J = cast<Instruction>(U);
5335           return !TheLoop->contains(J) || Worklist.count(J) ||
5336                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5337                   isScalarUse(J, Src));
5338         })) {
5339       Worklist.insert(Src);
5340       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5341     }
5342   }
5343 
5344   // An induction variable will remain scalar if all users of the induction
5345   // variable and induction variable update remain scalar.
5346   for (auto &Induction : Legal->getInductionVars()) {
5347     auto *Ind = Induction.first;
5348     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5349 
5350     // If tail-folding is applied, the primary induction variable will be used
5351     // to feed a vector compare.
5352     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5353       continue;
5354 
5355     // Determine if all users of the induction variable are scalar after
5356     // vectorization.
5357     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5358       auto *I = cast<Instruction>(U);
5359       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5360     });
5361     if (!ScalarInd)
5362       continue;
5363 
5364     // Determine if all users of the induction variable update instruction are
5365     // scalar after vectorization.
5366     auto ScalarIndUpdate =
5367         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5368           auto *I = cast<Instruction>(U);
5369           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5370         });
5371     if (!ScalarIndUpdate)
5372       continue;
5373 
5374     // The induction variable and its update instruction will remain scalar.
5375     Worklist.insert(Ind);
5376     Worklist.insert(IndUpdate);
5377     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5378     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5379                       << "\n");
5380   }
5381 
5382   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5383 }
5384 
5385 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
5386   if (!blockNeedsPredicationForAnyReason(I->getParent()))
5387     return false;
5388   switch(I->getOpcode()) {
5389   default:
5390     break;
5391   case Instruction::Load:
5392   case Instruction::Store: {
5393     if (!Legal->isMaskRequired(I))
5394       return false;
5395     auto *Ptr = getLoadStorePointerOperand(I);
5396     auto *Ty = getLoadStoreType(I);
5397     const Align Alignment = getLoadStoreAlignment(I);
5398     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5399                                 TTI.isLegalMaskedGather(Ty, Alignment))
5400                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5401                                 TTI.isLegalMaskedScatter(Ty, Alignment));
5402   }
5403   case Instruction::UDiv:
5404   case Instruction::SDiv:
5405   case Instruction::SRem:
5406   case Instruction::URem:
5407     return mayDivideByZero(*I);
5408   }
5409   return false;
5410 }
5411 
5412 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5413     Instruction *I, ElementCount VF) {
5414   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5415   assert(getWideningDecision(I, VF) == CM_Unknown &&
5416          "Decision should not be set yet.");
5417   auto *Group = getInterleavedAccessGroup(I);
5418   assert(Group && "Must have a group.");
5419 
5420   // If the instruction's allocated size doesn't equal it's type size, it
5421   // requires padding and will be scalarized.
5422   auto &DL = I->getModule()->getDataLayout();
5423   auto *ScalarTy = getLoadStoreType(I);
5424   if (hasIrregularType(ScalarTy, DL))
5425     return false;
5426 
5427   // Check if masking is required.
5428   // A Group may need masking for one of two reasons: it resides in a block that
5429   // needs predication, or it was decided to use masking to deal with gaps
5430   // (either a gap at the end of a load-access that may result in a speculative
5431   // load, or any gaps in a store-access).
5432   bool PredicatedAccessRequiresMasking =
5433       blockNeedsPredicationForAnyReason(I->getParent()) &&
5434       Legal->isMaskRequired(I);
5435   bool LoadAccessWithGapsRequiresEpilogMasking =
5436       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
5437       !isScalarEpilogueAllowed();
5438   bool StoreAccessWithGapsRequiresMasking =
5439       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
5440   if (!PredicatedAccessRequiresMasking &&
5441       !LoadAccessWithGapsRequiresEpilogMasking &&
5442       !StoreAccessWithGapsRequiresMasking)
5443     return true;
5444 
5445   // If masked interleaving is required, we expect that the user/target had
5446   // enabled it, because otherwise it either wouldn't have been created or
5447   // it should have been invalidated by the CostModel.
5448   assert(useMaskedInterleavedAccesses(TTI) &&
5449          "Masked interleave-groups for predicated accesses are not enabled.");
5450 
5451   if (Group->isReverse())
5452     return false;
5453 
5454   auto *Ty = getLoadStoreType(I);
5455   const Align Alignment = getLoadStoreAlignment(I);
5456   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5457                           : TTI.isLegalMaskedStore(Ty, Alignment);
5458 }
5459 
5460 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5461     Instruction *I, ElementCount VF) {
5462   // Get and ensure we have a valid memory instruction.
5463   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
5464 
5465   auto *Ptr = getLoadStorePointerOperand(I);
5466   auto *ScalarTy = getLoadStoreType(I);
5467 
5468   // In order to be widened, the pointer should be consecutive, first of all.
5469   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
5470     return false;
5471 
5472   // If the instruction is a store located in a predicated block, it will be
5473   // scalarized.
5474   if (isScalarWithPredication(I))
5475     return false;
5476 
5477   // If the instruction's allocated size doesn't equal it's type size, it
5478   // requires padding and will be scalarized.
5479   auto &DL = I->getModule()->getDataLayout();
5480   if (hasIrregularType(ScalarTy, DL))
5481     return false;
5482 
5483   return true;
5484 }
5485 
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
         "This function should not be visited twice for the same VF");

  // Create an (empty) entry for this VF so the analysis is not repeated even
  // when no uniform value is found; Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // We now know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto isOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(V);
    return (!I || !TheLoop->contains(I));
  };

  // Worklist containing uniform instructions demanding lane 0.
  SetVector<Instruction *> Worklist;
  BasicBlock *Latch = TheLoop->getLoopLatch();

  // Add uniform instructions demanding lane 0 to the worklist. Instructions
  // that are scalar with predication must not be considered uniform after
  // vectorization, because that would create an erroneous replicating region
  // where only a single instance out of VF should be formed.
  // TODO: optimize such seldom cases if found important, see PR40816.
  auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (isOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isScalarWithPredication(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
                        << *I << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(I);
  };

  // Start with the conditional branch. If the branch condition is an
  // instruction contained in the loop that is only used by the branch, it is
  // uniform.
  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
  if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
    addToWorklistIfAllowed(Cmp);

  // Returns true if the widening decision made for I keeps its address
  // computation uniform (i.e., I is widened, reversed, interleaved, or is a
  // uniform load that will be scalarized).
  auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    // A uniform memory op is itself uniform.  We exclude uniform stores
    // here as they demand the last lane, not the first one.
    if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
      assert(WideningDecision == CM_Scalarize);
      return true;
    }

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };


  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, and I is known to not require scalarization.
  auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
  };

  // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform.  A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform)
  SetVector<Value *> HasUniformUse;

  // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
        switch (II->getIntrinsicID()) {
        case Intrinsic::sideeffect:
        case Intrinsic::experimental_noalias_scope_decl:
        case Intrinsic::assume:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
          if (TheLoop->hasLoopInvariantOperands(&I))
            addToWorklistIfAllowed(&I);
          break;
        default:
          break;
        }
      }

      // ExtractValue instructions must be uniform, because the operands are
      // known to be loop-invariant.
      if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
        assert(isOutOfScope(EVI->getAggregateOperand()) &&
               "Expected aggregate value to be loop invariant");
        addToWorklistIfAllowed(EVI);
        continue;
      }

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = getLoadStorePointerOperand(&I);
      if (!Ptr)
        continue;

      // A uniform memory op is itself uniform.  We exclude uniform stores
      // here as they demand the last lane, not the first one.
      if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
        addToWorklistIfAllowed(&I);

      if (isUniformDecision(&I, VF)) {
        assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
        HasUniformUse.insert(Ptr);
      }
    }

  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
  // demanding) users.  Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
  for (auto *V : HasUniformUse) {
    if (isOutOfScope(V))
      continue;
    auto *I = cast<Instruction>(V);
    auto UsersAreMemAccesses =
      llvm::all_of(I->users(), [&](User *U) -> bool {
        return isVectorizedMemAccessUse(cast<Instruction>(U), V);
      });
    if (UsersAreMemAccesses)
      addToWorklistIfAllowed(I);
  }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist.  It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned idx = 0;
  while (idx != Worklist.size()) {
    Instruction *I = Worklist[idx++];

    for (auto OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (isOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(OV);
      if (OP && Legal->isFirstOrderRecurrence(OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(OV);
      if (llvm::all_of(OI->users(), [&](User *U) -> bool {
            auto *J = cast<Instruction>(U);
            return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
          }))
        addToWorklistIfAllowed(OI);
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  for (auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
      auto *I = cast<Instruction>(U);
      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
             isVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    auto UniformIndUpdate =
        llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          auto *I = cast<Instruction>(U);
          return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
                 isVectorizedMemAccessUse(I, IndUpdate);
        });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    addToWorklistIfAllowed(Ind);
    addToWorklistIfAllowed(IndUpdate);
  }

  // Commit the final set of uniform instructions for this VF.
  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
}
5696 
5697 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5698   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5699 
5700   if (Legal->getRuntimePointerChecking()->Need) {
5701     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5702         "runtime pointer checks needed. Enable vectorization of this "
5703         "loop with '#pragma clang loop vectorize(enable)' when "
5704         "compiling with -Os/-Oz",
5705         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5706     return true;
5707   }
5708 
5709   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5710     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5711         "runtime SCEV checks needed. Enable vectorization of this "
5712         "loop with '#pragma clang loop vectorize(enable)' when "
5713         "compiling with -Os/-Oz",
5714         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5715     return true;
5716   }
5717 
5718   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5719   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5720     reportVectorizationFailure("Runtime stride check for small trip count",
5721         "runtime stride == 1 checks needed. Enable vectorization of "
5722         "this loop without such check by compiling with -Os/-Oz",
5723         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5724     return true;
5725   }
5726 
5727   return false;
5728 }
5729 
ElementCount
LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
  // A scalable element count of 0 signals that scalable vectorization is
  // unfeasible for this loop.
  if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
    return ElementCount::getScalable(0);

  if (Hints->isScalableVectorizationDisabled()) {
    reportVectorizationInfo("Scalable vectorization is explicitly disabled",
                            "ScalableVectorizationDisabled", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");

  // Start from "unbounded" and narrow it down below.
  auto MaxScalableVF = ElementCount::getScalable(
      std::numeric_limits<ElementCount::ScalarTy>::max());

  // Test that the loop-vectorizer can legalize all operations for this MaxVF.
  // FIXME: While for scalable vectors this is currently sufficient, this should
  // be replaced by a more detailed mechanism that filters out specific VFs,
  // instead of invalidating vectorization for a whole set of VFs based on the
  // MaxVF.

  // Disable scalable vectorization if the loop contains unsupported reductions.
  if (!canVectorizeReductions(MaxScalableVF)) {
    reportVectorizationInfo(
        "Scalable vectorization not supported for the reduction "
        "operations found in this loop.",
        "ScalableVFUnfeasible", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  // Disable scalable vectorization if the loop contains any instructions
  // with element types not supported for scalable vectors.
  if (any_of(ElementTypesInLoop, [&](Type *Ty) {
        return !Ty->isVoidTy() &&
               !this->TTI.isElementTypeLegalForScalableVector(Ty);
      })) {
    reportVectorizationInfo("Scalable vectorization is not supported "
                            "for all element types found in this loop.",
                            "ScalableVFUnfeasible", ORE, TheLoop);
    return ElementCount::getScalable(0);
  }

  // No dependence-distance limit: any width is safe.
  if (Legal->isSafeForAnyVectorWidth())
    return MaxScalableVF;

  // Limit MaxScalableVF by the maximum safe dependence distance. If the
  // target does not report a max vscale, fall back to the upper bound of the
  // function's vscale_range attribute, when present.
  Optional<unsigned> MaxVScale = TTI.getMaxVScale();
  if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
    unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange)
                             .getVScaleRangeArgs()
                             .second;
    if (VScaleMax > 0)
      MaxVScale = VScaleMax;
  }
  // Without a known max vscale we cannot bound the scalable VF, so fall back
  // to 0 (unfeasible).
  MaxScalableVF = ElementCount::getScalable(
      MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
  if (!MaxScalableVF)
    reportVectorizationInfo(
        "Max legal vector width too small, scalable vectorization "
        "unfeasible.",
        "ScalableVFUnfeasible", ORE, TheLoop);

  return MaxScalableVF;
}
5795 
FixedScalableVFPair
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
                                                 ElementCount UserVF) {
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElements =
      PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);

  auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
      else
        return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    // An unsafe scalable UserVF is ignored; emit a remark explaining why and
    // fall through to the compiler-chosen VF below.
    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit([&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  // Default result: scalar-only for the fixed part, unfeasible (0) for the
  // scalable part; each is upgraded if the target can do better.
  FixedScalableVFPair Result(ElementCount::getFixed(1),
                             ElementCount::getScalable(0));
  if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
                                           WidestType, MaxSafeFixedVF))
    Result.FixedVF = MaxVF;

  if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
                                           WidestType, MaxSafeScalableVF))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}
5899 
5900 FixedScalableVFPair
5901 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5902   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to
    // be dynamically uniform if the target can skip.
5905     reportVectorizationFailure(
5906         "Not inserting runtime ptr check for divergent target",
5907         "runtime pointer checks needed. Not enabled for divergent target",
5908         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5909     return FixedScalableVFPair::getNone();
5910   }
5911 
5912   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5913   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5914   if (TC == 1) {
5915     reportVectorizationFailure("Single iteration (non) loop",
5916         "loop trip count is one, irrelevant for vectorization",
5917         "SingleIterationLoop", ORE, TheLoop);
5918     return FixedScalableVFPair::getNone();
5919   }
5920 
5921   switch (ScalarEpilogueStatus) {
5922   case CM_ScalarEpilogueAllowed:
5923     return computeFeasibleMaxVF(TC, UserVF);
5924   case CM_ScalarEpilogueNotAllowedUsePredicate:
5925     LLVM_FALLTHROUGH;
5926   case CM_ScalarEpilogueNotNeededUsePredicate:
5927     LLVM_DEBUG(
5928         dbgs() << "LV: vector predicate hint/switch found.\n"
5929                << "LV: Not allowing scalar epilogue, creating predicated "
5930                << "vector loop.\n");
5931     break;
5932   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5933     // fallthrough as a special case of OptForSize
5934   case CM_ScalarEpilogueNotAllowedOptSize:
5935     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5936       LLVM_DEBUG(
5937           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5938     else
5939       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5940                         << "count.\n");
5941 
5942     // Bail if runtime checks are required, which are not good when optimising
5943     // for size.
5944     if (runtimeChecksRequired())
5945       return FixedScalableVFPair::getNone();
5946 
5947     break;
5948   }
5949 
5950   // The only loops we can vectorize without a scalar epilogue, are loops with
5951   // a bottom-test and a single exiting block. We'd have to handle the fact
5952   // that not every instruction executes on the last iteration.  This will
5953   // require a lane mask which varies through the vector loop body.  (TODO)
5954   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5955     // If there was a tail-folding hint/switch, but we can't fold the tail by
5956     // masking, fallback to a vectorization with a scalar epilogue.
5957     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5958       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5959                            "scalar epilogue instead.\n");
5960       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5961       return computeFeasibleMaxVF(TC, UserVF);
5962     }
5963     return FixedScalableVFPair::getNone();
5964   }
5965 
5966   // Now try the tail folding
5967 
5968   // Invalidate interleave groups that require an epilogue if we can't mask
5969   // the interleave-group.
5970   if (!useMaskedInterleavedAccesses(TTI)) {
5971     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5972            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5975     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5976   }
5977 
5978   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
5979   // Avoid tail folding if the trip count is known to be a multiple of any VF
5980   // we chose.
5981   // FIXME: The condition below pessimises the case for fixed-width vectors,
5982   // when scalable VFs are also candidates for vectorization.
5983   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5984     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5985     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5986            "MaxFixedVF must be a power of 2");
5987     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5988                                    : MaxFixedVF.getFixedValue();
5989     ScalarEvolution *SE = PSE.getSE();
5990     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5991     const SCEV *ExitCount = SE->getAddExpr(
5992         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5993     const SCEV *Rem = SE->getURemExpr(
5994         SE->applyLoopGuards(ExitCount, TheLoop),
5995         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5996     if (Rem->isZero()) {
5997       // Accept MaxFixedVF if we do not have a tail.
5998       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5999       return MaxFactors;
6000     }
6001   }
6002 
6003   // For scalable vectors, don't use tail folding as this is currently not yet
6004   // supported. The code is likely to have ended up here if the tripcount is
6005   // low, in which case it makes sense not to use scalable vectors.
6006   if (MaxFactors.ScalableVF.isVector())
6007     MaxFactors.ScalableVF = ElementCount::getScalable(0);
6008 
6009   // If we don't know the precise trip count, or if the trip count that we
6010   // found modulo the vectorization factor is not zero, try to fold the tail
6011   // by masking.
6012   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
6013   if (Legal->prepareToFoldTailByMasking()) {
6014     FoldTailByMasking = true;
6015     return MaxFactors;
6016   }
6017 
6018   // If there was a tail-folding hint/switch, but we can't fold the tail by
6019   // masking, fallback to a vectorization with a scalar epilogue.
6020   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
6021     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
6022                          "scalar epilogue instead.\n");
6023     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
6024     return MaxFactors;
6025   }
6026 
6027   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
6028     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
6029     return FixedScalableVFPair::getNone();
6030   }
6031 
6032   if (TC == 0) {
6033     reportVectorizationFailure(
6034         "Unable to calculate the loop count due to complex control flow",
6035         "unable to calculate the loop count due to complex control flow",
6036         "UnknownLoopCountComplexCFG", ORE, TheLoop);
6037     return FixedScalableVFPair::getNone();
6038   }
6039 
6040   reportVectorizationFailure(
6041       "Cannot optimize for size and vectorize at the same time.",
6042       "cannot optimize for size and vectorize at the same time. "
6043       "Enable vectorization of this loop with '#pragma clang loop "
6044       "vectorize(enable)' when compiling with -Os/-Oz",
6045       "NoTailLoopWithOptForSize", ORE, TheLoop);
6046   return FixedScalableVFPair::getNone();
6047 }
6048 
/// Compute the maximum vectorization factor the target supports, starting
/// from the correctness upper bound \p MaxSafeVF, whose scalable flag selects
/// whether a scalable or fixed-width maximum is computed.
/// \p ConstTripCount is the compile-time constant trip count (0 if unknown);
/// \p SmallestType and \p WidestType are the smallest/widest element sizes,
/// in bits, observed in the loop.
ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
    unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
    const ElementCount &MaxSafeVF) {
  bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
  // Query the register width for the matching register class (for scalable
  // vectors the size is expressed as a known minimum).
  TypeSize WidestRegister = TTI.getRegisterBitWidth(
      ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
                           : TargetTransformInfo::RGK_FixedWidthVector);

  // Convenience function to return the minimum of two ElementCounts.
  auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
    assert((LHS.isScalable() == RHS.isScalable()) &&
           "Scalable flags must match");
    return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
  };

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be a powers of 2.
  auto MaxVectorElementCount = ElementCount::get(
      PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
      ComputeScalableMaxVF);
  MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
  LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
                    << (MaxVectorElementCount * WidestType) << " bits.\n");

  // A zero element count means the register class cannot hold even a single
  // element of the widest type; fall back to the scalar VF.
  if (!MaxVectorElementCount) {
    LLVM_DEBUG(dbgs() << "LV: The target has no "
                      << (ComputeScalableMaxVF ? "scalable" : "fixed")
                      << " vector registers.\n");
    return ElementCount::getFixed(1);
  }

  const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
  if (ConstTripCount &&
      ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
      isPowerOf2_32(ConstTripCount)) {
    // We need to clamp the VF to be the ConstTripCount. There is no point in
    // choosing a higher viable VF as done in the loop below. If
    // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
    // the TC is less than or equal to the known number of lanes.
    LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
                      << ConstTripCount << "\n");
    return TripCountEC;
  }

  ElementCount MaxVF = MaxVectorElementCount;
  if (TTI.shouldMaximizeVectorBandwidth() ||
      (MaximizeBandwidth && isScalarEpilogueAllowed())) {
    // When maximizing bandwidth, also consider larger VFs sized by the
    // smallest element type, as long as register pressure permits.
    auto MaxVectorElementCountMaxBW = ElementCount::get(
        PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
        ComputeScalableMaxVF);
    MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);

    // Collect all viable vectorization factors larger than the default MaxVF
    // (i.e. MaxVectorElementCount).
    SmallVector<ElementCount, 8> VFs;
    for (ElementCount VS = MaxVectorElementCount * 2;
         ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
      VFs.push_back(VS);

    // For each VF calculate its register usage.
    auto RUs = calculateRegisterUsage(VFs);

    // Select the largest VF which doesn't require more registers than existing
    // ones.
    for (int i = RUs.size() - 1; i >= 0; --i) {
      bool Selected = true;
      // A candidate is acceptable only if no register class is
      // over-subscribed.
      for (auto &pair : RUs[i].MaxLocalUsers) {
        unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
        if (pair.second > TargetNumRegisters)
          Selected = false;
      }
      if (Selected) {
        MaxVF = VFs[i];
        break;
      }
    }
    // Respect the target's minimum VF for this element size, if any.
    if (ElementCount MinVF =
            TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
      if (ElementCount::isKnownLT(MaxVF, MinVF)) {
        LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
                          << ") with target's minimum: " << MinVF << '\n');
        MaxVF = MinVF;
      }
    }
  }
  return MaxVF;
}
6136 
6137 bool LoopVectorizationCostModel::isMoreProfitable(
6138     const VectorizationFactor &A, const VectorizationFactor &B) const {
6139   InstructionCost CostA = A.Cost;
6140   InstructionCost CostB = B.Cost;
6141 
6142   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
6143 
6144   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
6145       MaxTripCount) {
6146     // If we are folding the tail and the trip count is a known (possibly small)
6147     // constant, the trip count will be rounded up to an integer number of
6148     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
6149     // which we compare directly. When not folding the tail, the total cost will
6150     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
6151     // approximated with the per-lane cost below instead of using the tripcount
6152     // as here.
6153     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
6154     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
6155     return RTCostA < RTCostB;
6156   }
6157 
6158   // Improve estimate for the vector width if it is scalable.
6159   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
6160   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
6161   if (Optional<unsigned> VScale = TTI.getVScaleForTuning()) {
6162     if (A.Width.isScalable())
6163       EstimatedWidthA *= VScale.getValue();
6164     if (B.Width.isScalable())
6165       EstimatedWidthB *= VScale.getValue();
6166   }
6167 
6168   // When set to preferred, for now assume vscale may be larger than 1 (or the
6169   // one being tuned for), so that scalable vectorization is slightly favorable
6170   // over fixed-width vectorization.
6171   if (Hints->isScalableVectorizationPreferred())
6172     if (A.Width.isScalable() && !B.Width.isScalable())
6173       return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
6174 
6175   // To avoid the need for FP division:
6176   //      (CostA / A.Width) < (CostB / B.Width)
6177   // <=>  (CostA * B.Width) < (CostB * A.Width)
6178   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
6179 }
6180 
/// Pick the most profitable vectorization factor out of \p VFCandidates,
/// comparing each candidate's expected cost against the scalar loop and
/// against the other candidates via isMoreProfitable(). Side effects: fills
/// ProfitableVFs with every candidate that beats the scalar cost, and emits
/// optimization remarks for instructions whose cost could not be computed.
VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
    const ElementCountSet &VFCandidates) {
  // The scalar loop cost is the baseline every vector candidate is compared
  // against.
  InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
  assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
  assert(VFCandidates.count(ElementCount::getFixed(1)) &&
         "Expected Scalar VF to be a candidate");

  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
  VectorizationFactor ChosenFactor = ScalarCost;

  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization && VFCandidates.size() > 1) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    ChosenFactor.Cost = InstructionCost::getMax();
  }

  // (instruction, VF) pairs for which no valid cost could be computed; used
  // below to emit grouped remarks.
  SmallVector<InstructionVFPair> InvalidCosts;
  for (const auto &i : VFCandidates) {
    // The cost for scalar VF=1 is already calculated, so ignore it.
    if (i.isScalar())
      continue;

    VectorizationCostTy C = expectedCost(i, &InvalidCosts);
    VectorizationFactor Candidate(i, C.first);

#ifndef NDEBUG
    // For debug output only: estimate the effective width of scalable
    // candidates using the target's tuning vscale (assumed 1 if unknown).
    unsigned AssumedMinimumVscale = 1;
    if (Optional<unsigned> VScale = TTI.getVScaleForTuning())
      AssumedMinimumVscale = VScale.getValue();
    unsigned Width =
        Candidate.Width.isScalable()
            ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
            : Candidate.Width.getFixedValue();
    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
                      << " costs: " << (Candidate.Cost / Width));
    if (i.isScalable())
      LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
                        << AssumedMinimumVscale << ")");
    LLVM_DEBUG(dbgs() << ".\n");
#endif

    // C.second is false when the candidate would not emit any vector
    // instructions; skip it unless the user forced vectorization.
    if (!C.second && !ForceVectorization) {
      LLVM_DEBUG(
          dbgs() << "LV: Not considering vector loop of width " << i
                 << " because it will not generate any vector instructions.\n");
      continue;
    }

    // If profitable add it to ProfitableVF list.
    if (isMoreProfitable(Candidate, ScalarCost))
      ProfitableVFs.push_back(Candidate);

    if (isMoreProfitable(Candidate, ChosenFactor))
      ChosenFactor = Candidate;
  }

  // Emit a report of VFs with invalid costs in the loop.
  if (!InvalidCosts.empty()) {
    // Group the remarks per instruction, keeping the instruction order from
    // InvalidCosts.
    std::map<Instruction *, unsigned> Numbering;
    unsigned I = 0;
    for (auto &Pair : InvalidCosts)
      if (!Numbering.count(Pair.first))
        Numbering[Pair.first] = I++;

    // Sort the list, first on instruction(number) then on VF.
    llvm::sort(InvalidCosts,
               [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
                 if (Numbering[A.first] != Numbering[B.first])
                   return Numbering[A.first] < Numbering[B.first];
                 ElementCountComparator ECC;
                 return ECC(A.second, B.second);
               });

    // For a list of ordered instruction-vf pairs:
    //   [(load, vf1), (load, vf2), (store, vf1)]
    // Group the instructions together to emit separate remarks for:
    //   load  (vf1, vf2)
    //   store (vf1)
    // Tail is the not-yet-emitted suffix of the sorted list; Subset is a
    // growing window at its front covering one instruction's pairs.
    auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
    auto Subset = ArrayRef<InstructionVFPair>();
    do {
      if (Subset.empty())
        Subset = Tail.take_front(1);

      Instruction *I = Subset.front().first;

      // If the next instruction is different, or if there are no other pairs,
      // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2))]
      // to emit:
      //  remark: invalid costs for 'load' at VF=(vf1, vf2)
      if (Subset == Tail || Tail[Subset.size()].first != I) {
        std::string OutString;
        raw_string_ostream OS(OutString);
        assert(!Subset.empty() && "Unexpected empty range");
        OS << "Instruction with invalid costs prevented vectorization at VF=(";
        // Comma-separate all VFs after the first one in the subset.
        for (auto &Pair : Subset)
          OS << (Pair.second == Subset.front().second ? "" : ", ")
             << Pair.second;
        OS << "):";
        if (auto *CI = dyn_cast<CallInst>(I))
          OS << " call to " << CI->getCalledFunction()->getName();
        else
          OS << " " << I->getOpcodeName();
        OS.flush();
        reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
        // Drop the emitted group and restart with an empty window.
        Tail = Tail.drop_front(Subset.size());
        Subset = {};
      } else
        // Grow the subset by one element
        Subset = Tail.take_front(Subset.size() + 1);
    } while (!Tail.empty());
  }

  // Predicated stores with conditional-store vectorization disabled force
  // the scalar loop regardless of the cost comparison above.
  if (!EnableCondStoresVectorization && NumPredStores) {
    reportVectorizationFailure("There are conditional stores.",
        "store that is conditionally executed prevents vectorization",
        "ConditionalStore", ORE, TheLoop);
    ChosenFactor = ScalarCost;
  }

  LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
                 ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
             << "LV: Vectorization seems to be not beneficial, "
             << "but was forced by a user.\n");
  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
  return ChosenFactor;
}
6314 
6315 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
6316     const Loop &L, ElementCount VF) const {
6317   // Cross iteration phis such as reductions need special handling and are
6318   // currently unsupported.
6319   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
6320         return Legal->isFirstOrderRecurrence(&Phi) ||
6321                Legal->isReductionVariable(&Phi);
6322       }))
6323     return false;
6324 
6325   // Phis with uses outside of the loop require special handling and are
6326   // currently unsupported.
6327   for (auto &Entry : Legal->getInductionVars()) {
6328     // Look for uses of the value of the induction at the last iteration.
6329     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
6330     for (User *U : PostInc->users())
6331       if (!L.contains(cast<Instruction>(U)))
6332         return false;
6333     // Look for uses of penultimate value of the induction.
6334     for (User *U : Entry.first->users())
6335       if (!L.contains(cast<Instruction>(U)))
6336         return false;
6337   }
6338 
6339   // Induction variables that are widened require special handling that is
6340   // currently not supported.
6341   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
6342         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
6343                  this->isProfitableToScalarize(Entry.first, VF));
6344       }))
6345     return false;
6346 
6347   // Epilogue vectorization code has not been auditted to ensure it handles
6348   // non-latch exits properly.  It may be fine, but it needs auditted and
6349   // tested.
6350   if (L.getExitingBlock() != L.getLoopLatch())
6351     return false;
6352 
6353   return true;
6354 }
6355 
6356 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
6357     const ElementCount VF) const {
6358   // FIXME: We need a much better cost-model to take different parameters such
6359   // as register pressure, code size increase and cost of extra branches into
6360   // account. For now we apply a very crude heuristic and only consider loops
6361   // with vectorization factors larger than a certain value.
6362   // We also consider epilogue vectorization unprofitable for targets that don't
6363   // consider interleaving beneficial (eg. MVE).
6364   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
6365     return false;
6366   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
6367     return true;
6368   return false;
6369 }
6370 
/// Select the vectorization factor for the epilogue loop that follows a main
/// loop vectorized with \p MainLoopVF. Returns
/// VectorizationFactor::Disabled() when epilogue vectorization should not be
/// performed. Only candidates for which \p LVP has a VPlan are considered.
VectorizationFactor
LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
    return Result;
  }

  // Epilogue vectorization requires a scalar epilogue to remain after the
  // second vector loop.
  if (!isScalarEpilogueAllowed()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
                  "allowed.\n";);
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
    LLVM_DEBUG(
        dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
                  "not a supported candidate.\n";);
    return Result;
  }

  // Honor a forced epilogue VF from the command line, provided a VPlan
  // exists for it.
  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
    if (LVP.hasPlanWithVF(ForcedEC))
      return {ForcedEC, 0};
    else {
      LLVM_DEBUG(
          dbgs()
              << "LEV: Epilogue vectorization forced factor is not viable.\n";);
      return Result;
    }
  }

  // Do not add an extra vector loop when optimizing the function for size.
  if (TheLoop->getHeader()->getParent()->hasOptSize() ||
      TheLoop->getHeader()->getParent()->hasMinSize()) {
    LLVM_DEBUG(
        dbgs()
            << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
    return Result;
  }

  // Scalable main-loop VFs are not supported here yet; the profitability
  // check below is done on the equivalent fixed-width factor instead.
  auto FixedMainLoopVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
  if (MainLoopVF.isScalable())
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization using scalable vectors not "
                  "yet supported. Converting to fixed-width (VF="
               << FixedMainLoopVF << ") instead\n");

  if (!isEpilogueVectorizationProfitable(FixedMainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                         "this loop\n");
    return Result;
  }

  // Pick the most profitable previously-computed candidate that is strictly
  // narrower than the main loop VF and has a VPlan available.
  for (auto &NextVF : ProfitableVFs)
    if (ElementCount::isKnownLT(NextVF.Width, FixedMainLoopVF) &&
        (Result.Width.getFixedValue() == 1 ||
         isMoreProfitable(NextVF, Result)) &&
        LVP.hasPlanWithVF(NextVF.Width))
      Result = NextVF;

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width.getFixedValue() << "\n";);
  return Result;
}
6442 
6443 std::pair<unsigned, unsigned>
6444 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
6445   unsigned MinWidth = -1U;
6446   unsigned MaxWidth = 8;
6447   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
6448   for (Type *T : ElementTypesInLoop) {
6449     MinWidth = std::min<unsigned>(
6450         MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
6451     MaxWidth = std::max<unsigned>(
6452         MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
6453   }
6454   return {MinWidth, MaxWidth};
6455 }
6456 
6457 void LoopVectorizationCostModel::collectElementTypesForWidening() {
6458   ElementTypesInLoop.clear();
6459   // For each block.
6460   for (BasicBlock *BB : TheLoop->blocks()) {
6461     // For each instruction in the loop.
6462     for (Instruction &I : BB->instructionsWithoutDebug()) {
6463       Type *T = I.getType();
6464 
6465       // Skip ignored values.
6466       if (ValuesToIgnore.count(&I))
6467         continue;
6468 
6469       // Only examine Loads, Stores and PHINodes.
6470       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6471         continue;
6472 
6473       // Examine PHI nodes that are reduction variables. Update the type to
6474       // account for the recurrence type.
6475       if (auto *PN = dyn_cast<PHINode>(&I)) {
6476         if (!Legal->isReductionVariable(PN))
6477           continue;
6478         const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN];
6479         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
6480             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
6481                                       RdxDesc.getRecurrenceType(),
6482                                       TargetTransformInfo::ReductionFlags()))
6483           continue;
6484         T = RdxDesc.getRecurrenceType();
6485       }
6486 
6487       // Examine the stored values.
6488       if (auto *ST = dyn_cast<StoreInst>(&I))
6489         T = ST->getValueOperand()->getType();
6490 
6491       // Ignore loaded pointer types and stored pointer types that are not
6492       // vectorizable.
6493       //
6494       // FIXME: The check here attempts to predict whether a load or store will
6495       //        be vectorized. We only know this for certain after a VF has
6496       //        been selected. Here, we assume that if an access can be
6497       //        vectorized, it will be. We should also look at extending this
6498       //        optimization to non-pointer types.
6499       //
6500       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6501           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
6502         continue;
6503 
6504       ElementTypesInLoop.insert(T);
6505     }
6506   }
6507 }
6508 
6509 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6510                                                            unsigned LoopCost) {
6511   // -- The interleave heuristics --
6512   // We interleave the loop in order to expose ILP and reduce the loop overhead.
6513   // There are many micro-architectural considerations that we can't predict
6514   // at this level. For example, frontend pressure (on decode or fetch) due to
6515   // code size, or the number and capabilities of the execution ports.
6516   //
6517   // We use the following heuristics to select the interleave count:
6518   // 1. If the code has reductions, then we interleave to break the cross
6519   // iteration dependency.
6520   // 2. If the loop is really small, then we interleave to reduce the loop
6521   // overhead.
6522   // 3. We don't interleave if we think that we will spill registers to memory
6523   // due to the increased register pressure.
6524 
6525   if (!isScalarEpilogueAllowed())
6526     return 1;
6527 
6528   // We used the distance for the interleave count.
6529   if (Legal->getMaxSafeDepDistBytes() != -1U)
6530     return 1;
6531 
6532   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6533   const bool HasReductions = !Legal->getReductionVars().empty();
6534   // Do not interleave loops with a relatively small known or estimated trip
6535   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
6536   // enabled, and the code has scalar reductions(HasReductions && VF = 1),
6537   // because with the above conditions interleaving can expose ILP and break
6538   // cross iteration dependences for reductions.
6539   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6540       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6541     return 1;
6542 
6543   RegisterUsage R = calculateRegisterUsage({VF})[0];
6544   // We divide by these constants so assume that we have at least one
6545   // instruction that uses at least one register.
6546   for (auto& pair : R.MaxLocalUsers) {
6547     pair.second = std::max(pair.second, 1U);
6548   }
6549 
6550   // We calculate the interleave count using the following formula.
6551   // Subtract the number of loop invariants from the number of available
6552   // registers. These registers are used by all of the interleaved instances.
6553   // Next, divide the remaining registers by the number of registers that is
6554   // required by the loop, in order to estimate how many parallel instances
6555   // fit without causing spills. All of this is rounded down if necessary to be
6556   // a power of two. We want power of two interleave count to simplify any
6557   // addressing operations or alignment considerations.
6558   // We also want power of two interleave counts to ensure that the induction
6559   // variable of the vector loop wraps to zero, when tail is folded by masking;
6560   // this currently happens when OptForSize, in which case IC is set to 1 above.
6561   unsigned IC = UINT_MAX;
6562 
6563   for (auto& pair : R.MaxLocalUsers) {
6564     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
6565     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6566                       << " registers of "
6567                       << TTI.getRegisterClassName(pair.first) << " register class\n");
6568     if (VF.isScalar()) {
6569       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6570         TargetNumRegisters = ForceTargetNumScalarRegs;
6571     } else {
6572       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6573         TargetNumRegisters = ForceTargetNumVectorRegs;
6574     }
6575     unsigned MaxLocalUsers = pair.second;
6576     unsigned LoopInvariantRegs = 0;
6577     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6578       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6579 
6580     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6581     // Don't count the induction variable as interleaved.
6582     if (EnableIndVarRegisterHeur) {
6583       TmpIC =
6584           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6585                         std::max(1U, (MaxLocalUsers - 1)));
6586     }
6587 
6588     IC = std::min(IC, TmpIC);
6589   }
6590 
6591   // Clamp the interleave ranges to reasonable counts.
6592   unsigned MaxInterleaveCount =
6593       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6594 
6595   // Check if the user has overridden the max.
6596   if (VF.isScalar()) {
6597     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6598       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6599   } else {
6600     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6601       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6602   }
6603 
6604   // If trip count is known or estimated compile time constant, limit the
6605   // interleave count to be less than the trip count divided by VF, provided it
6606   // is at least 1.
6607   //
6608   // For scalable vectors we can't know if interleaving is beneficial. It may
6609   // not be beneficial for small loops if none of the lanes in the second vector
6610   // iterations is enabled. However, for larger loops, there is likely to be a
6611   // similar benefit as for fixed-width vectors. For now, we choose to leave
6612   // the InterleaveCount as if vscale is '1', although if some information about
6613   // the vector is known (e.g. min vector size), we can make a better decision.
6614   if (BestKnownTC) {
6615     MaxInterleaveCount =
6616         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6617     // Make sure MaxInterleaveCount is greater than 0.
6618     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6619   }
6620 
6621   assert(MaxInterleaveCount > 0 &&
6622          "Maximum interleave count must be greater than 0");
6623 
6624   // Clamp the calculated IC to be between the 1 and the max interleave count
6625   // that the target and trip count allows.
6626   if (IC > MaxInterleaveCount)
6627     IC = MaxInterleaveCount;
6628   else
6629     // Make sure IC is greater than 0.
6630     IC = std::max(1u, IC);
6631 
6632   assert(IC > 0 && "Interleave count must be greater than 0.");
6633 
6634   // If we did not calculate the cost for VF (because the user selected the VF)
6635   // then we calculate the cost of VF here.
6636   if (LoopCost == 0) {
6637     InstructionCost C = expectedCost(VF).first;
6638     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
6639     LoopCost = *C.getValue();
6640   }
6641 
6642   assert(LoopCost && "Non-zero loop cost expected");
6643 
6644   // Interleave if we vectorized this loop and there is a reduction that could
6645   // benefit from interleaving.
6646   if (VF.isVector() && HasReductions) {
6647     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6648     return IC;
6649   }
6650 
6651   // Note that if we've already vectorized the loop we will have done the
6652   // runtime check and so interleaving won't require further checks.
6653   bool InterleavingRequiresRuntimePointerCheck =
6654       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6655 
6656   // We want to interleave small loops in order to reduce the loop overhead and
6657   // potentially expose ILP opportunities.
6658   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6659                     << "LV: IC is " << IC << '\n'
6660                     << "LV: VF is " << VF << '\n');
6661   const bool AggressivelyInterleaveReductions =
6662       TTI.enableAggressiveInterleaving(HasReductions);
6663   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
6664     // We assume that the cost overhead is 1 and we use the cost model
6665     // to estimate the cost of the loop and interleave until the cost of the
6666     // loop overhead is about 5% of the cost of the loop.
6667     unsigned SmallIC =
6668         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6669 
6670     // Interleave until store/load ports (estimated by max interleave count) are
6671     // saturated.
6672     unsigned NumStores = Legal->getNumStores();
6673     unsigned NumLoads = Legal->getNumLoads();
6674     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6675     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6676 
6677     // There is little point in interleaving for reductions containing selects
6678     // and compares when VF=1 since it may just create more overhead than it's
6679     // worth for loops with small trip counts. This is because we still have to
6680     // do the final reduction after the loop.
6681     bool HasSelectCmpReductions =
6682         HasReductions &&
6683         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6684           const RecurrenceDescriptor &RdxDesc = Reduction.second;
6685           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
6686               RdxDesc.getRecurrenceKind());
6687         });
6688     if (HasSelectCmpReductions) {
6689       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
6690       return 1;
6691     }
6692 
6693     // If we have a scalar reduction (vector reductions are already dealt with
6694     // by this point), we can increase the critical path length if the loop
6695     // we're interleaving is inside another loop. For tree-wise reductions
6696     // set the limit to 2, and for ordered reductions it's best to disable
6697     // interleaving entirely.
6698     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6699       bool HasOrderedReductions =
6700           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6701             const RecurrenceDescriptor &RdxDesc = Reduction.second;
6702             return RdxDesc.isOrdered();
6703           });
6704       if (HasOrderedReductions) {
6705         LLVM_DEBUG(
6706             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
6707         return 1;
6708       }
6709 
6710       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6711       SmallIC = std::min(SmallIC, F);
6712       StoresIC = std::min(StoresIC, F);
6713       LoadsIC = std::min(LoadsIC, F);
6714     }
6715 
6716     if (EnableLoadStoreRuntimeInterleave &&
6717         std::max(StoresIC, LoadsIC) > SmallIC) {
6718       LLVM_DEBUG(
6719           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6720       return std::max(StoresIC, LoadsIC);
6721     }
6722 
6723     // If there are scalar reductions and TTI has enabled aggressive
6724     // interleaving for reductions, we will interleave to expose ILP.
6725     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6726         AggressivelyInterleaveReductions) {
6727       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6728       // Interleave no less than SmallIC but not as aggressive as the normal IC
6729       // to satisfy the rare situation when resources are too limited.
6730       return std::max(IC / 2, SmallIC);
6731     } else {
6732       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6733       return SmallIC;
6734     }
6735   }
6736 
6737   // Interleave if this is a large loop (small loops are already dealt with by
6738   // this point) that could benefit from interleaving.
6739   if (AggressivelyInterleaveReductions) {
6740     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6741     return IC;
6742   }
6743 
6744   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6745   return 1;
6746 }
6747 
6748 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6749 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6750   // This function calculates the register usage by measuring the highest number
6751   // of values that are alive at a single location. Obviously, this is a very
6752   // rough estimation. We scan the loop in a topological order in order and
6753   // assign a number to each instruction. We use RPO to ensure that defs are
6754   // met before their users. We assume that each instruction that has in-loop
6755   // users starts an interval. We record every time that an in-loop value is
6756   // used, so we have a list of the first and last occurrences of each
6757   // instruction. Next, we transpose this data structure into a multi map that
6758   // holds the list of intervals that *end* at a specific location. This multi
6759   // map allows us to perform a linear search. We scan the instructions linearly
6760   // and record each time that a new interval starts, by placing it in a set.
6761   // If we find this value in the multi-map then we remove it from the set.
6762   // The max register usage is the maximum size of the set.
6763   // We also search for instructions that are defined outside the loop, but are
6764   // used inside the loop. We need this number separately from the max-interval
6765   // usage number because when we unroll, loop-invariant values do not take
6766   // more register.
6767   LoopBlocksDFS DFS(TheLoop);
6768   DFS.perform(LI);
6769 
6770   RegisterUsage RU;
6771 
6772   // Each 'key' in the map opens a new interval. The values
6773   // of the map are the index of the 'last seen' usage of the
6774   // instruction that is the key.
6775   using IntervalMap = DenseMap<Instruction *, unsigned>;
6776 
6777   // Maps instruction to its index.
6778   SmallVector<Instruction *, 64> IdxToInstr;
6779   // Marks the end of each interval.
6780   IntervalMap EndPoint;
6781   // Saves the list of instruction indices that are used in the loop.
6782   SmallPtrSet<Instruction *, 8> Ends;
6783   // Saves the list of values that are used in the loop but are
6784   // defined outside the loop, such as arguments and constants.
6785   SmallPtrSet<Value *, 8> LoopInvariants;
6786 
6787   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6788     for (Instruction &I : BB->instructionsWithoutDebug()) {
6789       IdxToInstr.push_back(&I);
6790 
6791       // Save the end location of each USE.
6792       for (Value *U : I.operands()) {
6793         auto *Instr = dyn_cast<Instruction>(U);
6794 
6795         // Ignore non-instruction values such as arguments, constants, etc.
6796         if (!Instr)
6797           continue;
6798 
6799         // If this instruction is outside the loop then record it and continue.
6800         if (!TheLoop->contains(Instr)) {
6801           LoopInvariants.insert(Instr);
6802           continue;
6803         }
6804 
6805         // Overwrite previous end points.
6806         EndPoint[Instr] = IdxToInstr.size();
6807         Ends.insert(Instr);
6808       }
6809     }
6810   }
6811 
6812   // Saves the list of intervals that end with the index in 'key'.
6813   using InstrList = SmallVector<Instruction *, 2>;
6814   DenseMap<unsigned, InstrList> TransposeEnds;
6815 
6816   // Transpose the EndPoints to a list of values that end at each index.
6817   for (auto &Interval : EndPoint)
6818     TransposeEnds[Interval.second].push_back(Interval.first);
6819 
6820   SmallPtrSet<Instruction *, 8> OpenIntervals;
6821   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6822   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6823 
6824   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6825 
6826   // A lambda that gets the register usage for the given type and VF.
6827   const auto &TTICapture = TTI;
6828   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6829     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6830       return 0;
6831     InstructionCost::CostType RegUsage =
6832         *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6833     assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
6834            "Nonsensical values for register usage.");
6835     return RegUsage;
6836   };
6837 
6838   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6839     Instruction *I = IdxToInstr[i];
6840 
6841     // Remove all of the instructions that end at this location.
6842     InstrList &List = TransposeEnds[i];
6843     for (Instruction *ToRemove : List)
6844       OpenIntervals.erase(ToRemove);
6845 
6846     // Ignore instructions that are never used within the loop.
6847     if (!Ends.count(I))
6848       continue;
6849 
6850     // Skip ignored values.
6851     if (ValuesToIgnore.count(I))
6852       continue;
6853 
6854     // For each VF find the maximum usage of registers.
6855     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6856       // Count the number of live intervals.
6857       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6858 
6859       if (VFs[j].isScalar()) {
6860         for (auto Inst : OpenIntervals) {
6861           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6862           if (RegUsage.find(ClassID) == RegUsage.end())
6863             RegUsage[ClassID] = 1;
6864           else
6865             RegUsage[ClassID] += 1;
6866         }
6867       } else {
6868         collectUniformsAndScalars(VFs[j]);
6869         for (auto Inst : OpenIntervals) {
6870           // Skip ignored values for VF > 1.
6871           if (VecValuesToIgnore.count(Inst))
6872             continue;
6873           if (isScalarAfterVectorization(Inst, VFs[j])) {
6874             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6875             if (RegUsage.find(ClassID) == RegUsage.end())
6876               RegUsage[ClassID] = 1;
6877             else
6878               RegUsage[ClassID] += 1;
6879           } else {
6880             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
6881             if (RegUsage.find(ClassID) == RegUsage.end())
6882               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6883             else
6884               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6885           }
6886         }
6887       }
6888 
6889       for (auto& pair : RegUsage) {
6890         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
6891           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
6892         else
6893           MaxUsages[j][pair.first] = pair.second;
6894       }
6895     }
6896 
6897     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6898                       << OpenIntervals.size() << '\n');
6899 
6900     // Add the current instruction to the list of open intervals.
6901     OpenIntervals.insert(I);
6902   }
6903 
6904   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6905     SmallMapVector<unsigned, unsigned, 4> Invariant;
6906 
6907     for (auto Inst : LoopInvariants) {
6908       unsigned Usage =
6909           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6910       unsigned ClassID =
6911           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6912       if (Invariant.find(ClassID) == Invariant.end())
6913         Invariant[ClassID] = Usage;
6914       else
6915         Invariant[ClassID] += Usage;
6916     }
6917 
6918     LLVM_DEBUG({
6919       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6920       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6921              << " item\n";
6922       for (const auto &pair : MaxUsages[i]) {
6923         dbgs() << "LV(REG): RegisterClass: "
6924                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6925                << " registers\n";
6926       }
6927       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6928              << " item\n";
6929       for (const auto &pair : Invariant) {
6930         dbgs() << "LV(REG): RegisterClass: "
6931                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6932                << " registers\n";
6933       }
6934     });
6935 
6936     RU.LoopInvariantRegs = Invariant;
6937     RU.MaxLocalUsers = MaxUsages[i];
6938     RUs[i] = RU;
6939   }
6940 
6941   return RUs;
6942 }
6943 
6944 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
6945   // TODO: Cost model for emulated masked load/store is completely
6946   // broken. This hack guides the cost model to use an artificially
6947   // high enough value to practically disable vectorization with such
6948   // operations, except where previously deployed legality hack allowed
6949   // using very low cost values. This is to avoid regressions coming simply
6950   // from moving "masked load/store" check from legality to cost model.
6951   // Masked Load/Gather emulation was previously never allowed.
6952   // Limited number of Masked Store/Scatter emulation was allowed.
6953   assert(isPredicatedInst(I) &&
6954          "Expecting a scalar emulated instruction");
6955   return isa<LoadInst>(I) ||
6956          (isa<StoreInst>(I) &&
6957           NumPredStores > NumberOfStoresToPredicate);
6958 }
6959 
6960 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6961   // If we aren't vectorizing the loop, or if we've already collected the
6962   // instructions to scalarize, there's nothing to do. Collection may already
6963   // have occurred if we have a user-selected VF and are now computing the
6964   // expected cost for interleaving.
6965   if (VF.isScalar() || VF.isZero() ||
6966       InstsToScalarize.find(VF) != InstsToScalarize.end())
6967     return;
6968 
6969   // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
6970   // not profitable to scalarize any instructions, the presence of VF in the
6971   // map will indicate that we've analyzed it already.
6972   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6973 
6974   // Find all the instructions that are scalar with predication in the loop and
6975   // determine if it would be better to not if-convert the blocks they are in.
6976   // If so, we also record the instructions to scalarize.
6977   for (BasicBlock *BB : TheLoop->blocks()) {
6978     if (!blockNeedsPredicationForAnyReason(BB))
6979       continue;
6980     for (Instruction &I : *BB)
6981       if (isScalarWithPredication(&I)) {
6982         ScalarCostsTy ScalarCosts;
6983         // Do not apply discount if scalable, because that would lead to
6984         // invalid scalarization costs.
6985         // Do not apply discount logic if hacked cost is needed
6986         // for emulated masked memrefs.
6987         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
6988             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6989           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6990         // Remember that BB will remain after vectorization.
6991         PredicatedBBsAfterVectorization.insert(BB);
6992       }
6993   }
6994 }
6995 
/// Compute the expected cost "discount" (vector cost minus scalar cost) from
/// scalarizing the single-use chain of instructions feeding \p PredInst for
/// the given \p VF. A non-negative result means the scalar form is expected
/// to be no more expensive than the vector form. Each instruction considered
/// for scalarization is recorded in \p ScalarCosts with its scalar cost.
int LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto canBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))
        if (isUniformAfterVectorization(J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.find(I) != ScalarCosts.end())
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF).first;

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    // NOTE(review): VF.getFixedValue() implies this path is only reached for
    // fixed-width VFs — the scalable case is filtered out by the caller.
    InstructionCost ScalarCost =
        VF.getFixedValue() *
        getInstructionCost(I, ElementCount::getFixed(1)).first;

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
      // Inserting each scalar result back into a vector...
      ScalarCost += TTI.getScalarizationOverhead(
          cast<VectorType>(ToVectorTy(I->getType(), VF)),
          APInt::getAllOnes(VF.getFixedValue()), true, false);
      // ...plus one phi per lane to merge the predicated results.
      ScalarCost +=
          VF.getFixedValue() *
          TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
        assert(VectorType::isValidElementType(J->getType()) &&
               "Instruction has non-scalar type");
        if (canBeScalarized(J))
          Worklist.push_back(J);
        else if (needsExtract(J, VF)) {
          // The operand stays vectorized, so every lane must be extracted.
          ScalarCost += TTI.getScalarizationOverhead(
              cast<VectorType>(ToVectorTy(J->getType(), VF)),
              APInt::getAllOnes(VF.getFixedValue()), false, true);
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getReciprocalPredBlockProb();

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  // NOTE(review): getValue() asserts validity — this assumes every cost
  // accumulated above was valid for a fixed-width VF; confirm callers never
  // reach here with invalid instruction costs.
  return *Discount.getValue();
}
7107 
7108 LoopVectorizationCostModel::VectorizationCostTy
7109 LoopVectorizationCostModel::expectedCost(
7110     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
7111   VectorizationCostTy Cost;
7112 
7113   // For each block.
7114   for (BasicBlock *BB : TheLoop->blocks()) {
7115     VectorizationCostTy BlockCost;
7116 
7117     // For each instruction in the old loop.
7118     for (Instruction &I : BB->instructionsWithoutDebug()) {
7119       // Skip ignored values.
7120       if (ValuesToIgnore.count(&I) ||
7121           (VF.isVector() && VecValuesToIgnore.count(&I)))
7122         continue;
7123 
7124       VectorizationCostTy C = getInstructionCost(&I, VF);
7125 
7126       // Check if we should override the cost.
7127       if (C.first.isValid() &&
7128           ForceTargetInstructionCost.getNumOccurrences() > 0)
7129         C.first = InstructionCost(ForceTargetInstructionCost);
7130 
7131       // Keep a list of instructions with invalid costs.
7132       if (Invalid && !C.first.isValid())
7133         Invalid->emplace_back(&I, VF);
7134 
7135       BlockCost.first += C.first;
7136       BlockCost.second |= C.second;
7137       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
7138                         << " for VF " << VF << " For instruction: " << I
7139                         << '\n');
7140     }
7141 
7142     // If we are vectorizing a predicated block, it will have been
7143     // if-converted. This means that the block's instructions (aside from
7144     // stores and instructions that may divide by zero) will now be
7145     // unconditionally executed. For the scalar case, we may not always execute
7146     // the predicated block, if it is an if-else block. Thus, scale the block's
7147     // cost by the probability of executing it. blockNeedsPredication from
7148     // Legal is used so as to not include all blocks in tail folded loops.
7149     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
7150       BlockCost.first /= getReciprocalPredBlockProb();
7151 
7152     Cost.first += BlockCost.first;
7153     Cost.second |= BlockCost.second;
7154   }
7155 
7156   return Cost;
7157 }
7158 
7159 /// Gets Address Access SCEV after verifying that the access pattern
7160 /// is loop invariant except the induction variable dependence.
7161 ///
7162 /// This SCEV can be sent to the Target in order to estimate the address
7163 /// calculation cost.
7164 static const SCEV *getAddressAccessSCEV(
7165               Value *Ptr,
7166               LoopVectorizationLegality *Legal,
7167               PredicatedScalarEvolution &PSE,
7168               const Loop *TheLoop) {
7169 
7170   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
7171   if (!Gep)
7172     return nullptr;
7173 
7174   // We are looking for a gep with all loop invariant indices except for one
7175   // which should be an induction variable.
7176   auto SE = PSE.getSE();
7177   unsigned NumOperands = Gep->getNumOperands();
7178   for (unsigned i = 1; i < NumOperands; ++i) {
7179     Value *Opd = Gep->getOperand(i);
7180     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
7181         !Legal->isInductionVariable(Opd))
7182       return nullptr;
7183   }
7184 
7185   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
7186   return PSE.getSCEV(Ptr);
7187 }
7188 
7189 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
7190   return Legal->hasStride(I->getOperand(0)) ||
7191          Legal->hasStride(I->getOperand(1));
7192 }
7193 
7194 InstructionCost
7195 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
7196                                                         ElementCount VF) {
7197   assert(VF.isVector() &&
7198          "Scalarization cost of instruction implies vectorization.");
7199   if (VF.isScalable())
7200     return InstructionCost::getInvalid();
7201 
7202   Type *ValTy = getLoadStoreType(I);
7203   auto SE = PSE.getSE();
7204 
7205   unsigned AS = getLoadStoreAddressSpace(I);
7206   Value *Ptr = getLoadStorePointerOperand(I);
7207   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
7208 
7209   // Figure out whether the access is strided and get the stride value
7210   // if it's known in compile time
7211   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
7212 
7213   // Get the cost of the scalar memory instruction and address computation.
7214   InstructionCost Cost =
7215       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
7216 
7217   // Don't pass *I here, since it is scalar but will actually be part of a
7218   // vectorized loop where the user of it is a vectorized instruction.
7219   const Align Alignment = getLoadStoreAlignment(I);
7220   Cost += VF.getKnownMinValue() *
7221           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
7222                               AS, TTI::TCK_RecipThroughput);
7223 
7224   // Get the overhead of the extractelement and insertelement instructions
7225   // we might create due to scalarization.
7226   Cost += getScalarizationOverhead(I, VF);
7227 
7228   // If we have a predicated load/store, it will need extra i1 extracts and
7229   // conditional branches, but may not be executed for each vector lane. Scale
7230   // the cost by the probability of executing the predicated block.
7231   if (isPredicatedInst(I)) {
7232     Cost /= getReciprocalPredBlockProb();
7233 
7234     // Add the cost of an i1 extract and a branch
7235     auto *Vec_i1Ty =
7236         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
7237     Cost += TTI.getScalarizationOverhead(
7238         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
7239         /*Insert=*/false, /*Extract=*/true);
7240     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
7241 
7242     if (useEmulatedMaskMemRefHack(I))
7243       // Artificially setting to a high enough value to practically disable
7244       // vectorization with such operations.
7245       Cost = 3000000;
7246   }
7247 
7248   return Cost;
7249 }
7250 
7251 InstructionCost
7252 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
7253                                                     ElementCount VF) {
7254   Type *ValTy = getLoadStoreType(I);
7255   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7256   Value *Ptr = getLoadStorePointerOperand(I);
7257   unsigned AS = getLoadStoreAddressSpace(I);
7258   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
7259   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7260 
7261   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7262          "Stride should be 1 or -1 for consecutive memory access");
7263   const Align Alignment = getLoadStoreAlignment(I);
7264   InstructionCost Cost = 0;
7265   if (Legal->isMaskRequired(I))
7266     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
7267                                       CostKind);
7268   else
7269     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
7270                                 CostKind, I);
7271 
7272   bool Reverse = ConsecutiveStride < 0;
7273   if (Reverse)
7274     Cost +=
7275         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
7276   return Cost;
7277 }
7278 
7279 InstructionCost
7280 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
7281                                                 ElementCount VF) {
7282   assert(Legal->isUniformMemOp(*I));
7283 
7284   Type *ValTy = getLoadStoreType(I);
7285   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7286   const Align Alignment = getLoadStoreAlignment(I);
7287   unsigned AS = getLoadStoreAddressSpace(I);
7288   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7289   if (isa<LoadInst>(I)) {
7290     return TTI.getAddressComputationCost(ValTy) +
7291            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
7292                                CostKind) +
7293            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
7294   }
7295   StoreInst *SI = cast<StoreInst>(I);
7296 
7297   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
7298   return TTI.getAddressComputationCost(ValTy) +
7299          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
7300                              CostKind) +
7301          (isLoopInvariantStoreValue
7302               ? 0
7303               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
7304                                        VF.getKnownMinValue() - 1));
7305 }
7306 
7307 InstructionCost
7308 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
7309                                                  ElementCount VF) {
7310   Type *ValTy = getLoadStoreType(I);
7311   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7312   const Align Alignment = getLoadStoreAlignment(I);
7313   const Value *Ptr = getLoadStorePointerOperand(I);
7314 
7315   return TTI.getAddressComputationCost(VectorTy) +
7316          TTI.getGatherScatterOpCost(
7317              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
7318              TargetTransformInfo::TCK_RecipThroughput, I);
7319 }
7320 
/// Estimate the cost of executing \p I as part of its interleaved access
/// group at the given (fixed-width) \p VF.
InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
                                                   ElementCount VF) {
  // TODO: Once we have support for interleaving with scalable vectors
  // we can calculate the cost properly here.
  if (VF.isScalable())
    return InstructionCost::getInvalid();

  Type *ValTy = getLoadStoreType(I);
  auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
  unsigned AS = getLoadStoreAddressSpace(I);

  auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Fail to get an interleaved access group.");

  // The wide vector spans all InterleaveFactor members of the group, whether
  // they exist or not (gaps are handled below).
  unsigned InterleaveFactor = Group->getFactor();
  auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);

  // Holds the indices of existing members in the interleaved group.
  SmallVector<unsigned, 4> Indices;
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(IF))
      Indices.push_back(IF);

  // Calculate the cost of the whole interleaved group. Masking for gaps is
  // needed when a scalar epilogue (that would otherwise absorb the trailing
  // partial group) is disallowed, or when storing a group with missing
  // members.
  bool UseMaskForGaps =
      (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
  InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
      I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
      AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
    assert(!Legal->isMaskRequired(I) &&
           "Reverse masked interleaved access not supported.");
    // Each member produced by the group needs its own reverse shuffle.
    Cost +=
        Group->getNumMembers() *
        TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
  }
  return Cost;
}
7363 
Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
    Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  using namespace llvm::PatternMatch;
  // Early exit for no inloop reductions
  if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
    return None;
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower then we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not we return an invalid cost specifying the original cost method
  // should be used.
  // Walk down from I to the chain's root reduction instruction (RetI),
  // stepping over a single-use ext and/or a mul feeding an add.
  Instruction *RetI = I;
  if (match(RetI, m_ZExtOrSExt(m_Value()))) {
    // An ext with multiple users cannot be folded into the reduction.
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }
  if (match(RetI, m_Mul(m_Value(), m_Value())) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    if (!RetI->hasOneUser())
      return None;
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  if (!InLoopReductionImmediateChains.count(RetI))
    return None;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own. The immediate-chain map links each reduction
  // instruction to its predecessor in the chain, ending at the reduction phi.
  Instruction *LastChain = InLoopReductionImmediateChains[RetI];
  Instruction *ReductionPhi = LastChain;
  while (!isa<PHINode>(ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];

  const RecurrenceDescriptor &RdxDesc =
      Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];

  InstructionCost BaseCost = TTI.getArithmeticReductionCost(
      RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.
  if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
    BaseCost +=
        TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of the
  // patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(1) == LastChain
                           ? dyn_cast<Instruction>(RetI->getOperand(0))
                           : dyn_cast<Instruction>(RetI->getOperand(1));

  // Re-derive the vector type from I's input element type, which is the
  // (possibly narrower) source type of the pattern being costed.
  VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp &&
      match(RedOp,
            m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
      match(Op0, m_ZExtOrSExt(m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
      !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce(ext(mul(ext(A), ext(B)))
    // Note that the extend opcodes need to all match, or if A==B they will have
    // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
    // which is equally fine.
    bool IsUnsigned = isa<ZExtInst>(Op0);
    auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
    auto *MulType = VectorType::get(Op0->getType(), VectorTy);

    // Cost of the component instructions that the fused reduction would
    // replace: two inner extends, the mul, and the outer extend.
    InstructionCost ExtCost =
        TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
                             TTI::CastContextHint::None, CostKind, Op0);
    InstructionCost MulCost =
        TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
    InstructionCost Ext2Cost =
        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
                             TTI::CastContextHint::None, CostKind, RedOp);

    InstructionCost RedCost = TTI.getExtendedAddReductionCost(
        /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
        CostKind);

    // Only use the fused cost if it beats the sum of the components.
    if (RedCost.isValid() &&
        RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
             !TheLoop->isLoopInvariant(RedOp)) {
    // Matched reduce(ext(A))
    bool IsUnsigned = isa<ZExtInst>(RedOp);
    auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
    InstructionCost RedCost = TTI.getExtendedAddReductionCost(
        /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
        CostKind);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
                             TTI::CastContextHint::None, CostKind, RedOp);
    if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp &&
             match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
    if (match(Op0, m_ZExtOrSExt(m_Value())) &&
        Op0->getOpcode() == Op1->getOpcode() &&
        Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
        !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
      bool IsUnsigned = isa<ZExtInst>(Op0);
      auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
      // Matched reduce(mul(ext, ext))
      InstructionCost ExtCost =
          TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
                               TTI::CastContextHint::None, CostKind, Op0);
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

      InstructionCost RedCost = TTI.getExtendedAddReductionCost(
          /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
          CostKind);

      if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
        return I == RetI ? RedCost : 0;
    } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
      // Matched reduce(mul())
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);

      InstructionCost RedCost = TTI.getExtendedAddReductionCost(
          /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
          CostKind);

      if (RedCost.isValid() && RedCost < MulCost + BaseCost)
        return I == RetI ? RedCost : 0;
    }
  }

  // No cheaper fused pattern found: the root reduction instruction reports
  // the plain reduction cost; all other instructions in the pattern fall back
  // to the default cost modelling.
  return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
}
7520 
7521 InstructionCost
7522 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7523                                                      ElementCount VF) {
7524   // Calculate scalar cost only. Vectorization cost should be ready at this
7525   // moment.
7526   if (VF.isScalar()) {
7527     Type *ValTy = getLoadStoreType(I);
7528     const Align Alignment = getLoadStoreAlignment(I);
7529     unsigned AS = getLoadStoreAddressSpace(I);
7530 
7531     return TTI.getAddressComputationCost(ValTy) +
7532            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7533                                TTI::TCK_RecipThroughput, I);
7534   }
7535   return getWideningCost(I, VF);
7536 }
7537 
7538 LoopVectorizationCostModel::VectorizationCostTy
7539 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7540                                                ElementCount VF) {
7541   // If we know that this instruction will remain uniform, check the cost of
7542   // the scalar version.
7543   if (isUniformAfterVectorization(I, VF))
7544     VF = ElementCount::getFixed(1);
7545 
7546   if (VF.isVector() && isProfitableToScalarize(I, VF))
7547     return VectorizationCostTy(InstsToScalarize[VF][I], false);
7548 
7549   // Forced scalars do not have any scalarization overhead.
7550   auto ForcedScalar = ForcedScalars.find(VF);
7551   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
7552     auto InstSet = ForcedScalar->second;
7553     if (InstSet.count(I))
7554       return VectorizationCostTy(
7555           (getInstructionCost(I, ElementCount::getFixed(1)).first *
7556            VF.getKnownMinValue()),
7557           false);
7558   }
7559 
7560   Type *VectorTy;
7561   InstructionCost C = getInstructionCost(I, VF, VectorTy);
7562 
7563   bool TypeNotScalarized = false;
7564   if (VF.isVector() && VectorTy->isVectorTy()) {
7565     unsigned NumParts = TTI.getNumberOfParts(VectorTy);
7566     if (NumParts)
7567       TypeNotScalarized = NumParts < VF.getKnownMinValue();
7568     else
7569       C = InstructionCost::getInvalid();
7570   }
7571   return VectorizationCostTy(C, TypeNotScalarized);
7572 }
7573 
7574 InstructionCost
7575 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
7576                                                      ElementCount VF) const {
7577 
7578   // There is no mechanism yet to create a scalable scalarization loop,
7579   // so this is currently Invalid.
7580   if (VF.isScalable())
7581     return InstructionCost::getInvalid();
7582 
7583   if (VF.isScalar())
7584     return 0;
7585 
7586   InstructionCost Cost = 0;
7587   Type *RetTy = ToVectorTy(I->getType(), VF);
7588   if (!RetTy->isVoidTy() &&
7589       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
7590     Cost += TTI.getScalarizationOverhead(
7591         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
7592         false);
7593 
7594   // Some targets keep addresses scalar.
7595   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
7596     return Cost;
7597 
7598   // Some targets support efficient element stores.
7599   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
7600     return Cost;
7601 
7602   // Collect operands to consider.
7603   CallInst *CI = dyn_cast<CallInst>(I);
7604   Instruction::op_range Ops = CI ? CI->args() : I->operands();
7605 
7606   // Skip operands that do not require extraction/scalarization and do not incur
7607   // any overhead.
7608   SmallVector<Type *> Tys;
7609   for (auto *V : filterExtractingOperands(Ops, VF))
7610     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7611   return Cost + TTI.getOperandsScalarizationOverhead(
7612                     filterExtractingOperands(Ops, VF), Tys);
7613 }
7614 
// For each memory instruction in the loop, pick the cheapest widening
// strategy at this VF (widen, widen-reversed, interleave, gather/scatter, or
// scalarize) and record the decision plus its cost for later queries.
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      Value *Ptr =  getLoadStorePointerOperand(&I);
      // Only loads and stores carry a pointer operand; skip everything else.
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
        NumPredStores++;

      if (Legal->isUniformMemOp(I)) {
        // TODO: Avoid replicating loads and stores instead of
        // relying on instcombine to remove them.
        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        InstructionCost Cost;
        // Uniform stores at scalable VFs cannot be scalarized (no fixed lane
        // count), so fall back to scatter when it is legal.
        if (isa<StoreInst>(&I) && VF.isScalable() &&
            isLegalGatherOrScatter(&I)) {
          Cost = getGatherScatterCost(&I, VF);
          setWideningDecision(&I, VF, CM_GatherScatter, Cost);
        } else {
          assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
                 "Cannot yet scalarize uniform stores");
          Cost = getUniformMemOpCost(&I, VF);
          setWideningDecision(&I, VF, CM_Scalarize, Cost);
        }
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(&I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
            getLoadStoreType(&I), getLoadStorePointerOperand(&I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        // Stride -1 means the access walks memory backwards: widen reversed.
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(&I, VF, Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(&I)) {
        auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(&I, VF) != CM_Unknown)
          continue;

        // The interleave cost covers the whole group, so gather/scatter and
        // scalarization costs below are scaled by the member count to stay
        // comparable.
        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(&I, VF))
          InterleaveCost = getInterleaveGroupCost(&I, VF);
      }

      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(&I)
              ? getGatherScatterCost(&I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      // Ties between interleave and gather/scatter favor interleave; ties
      // between gather/scatter and scalarization favor scalarization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (auto Group = getInterleavedAccessGroup(&I))
        setWideningDecision(Group, VF, Decision, Cost);
      else
        setWideningDecision(&I, VF, Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
        dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
      if (PtrDef && TheLoop->contains(PtrDef) &&
          getWideningDecision(&I, VF) != CM_GatherScatter)
        AddrDefs.insert(PtrDef);
    }

  // Add all instructions used to generate the addresses.
  // Transitive closure over same-block, non-PHI operands; insert().second
  // guards against revisiting an instruction already in the set.
  SmallVector<Instruction *, 4> Worklist;
  append_range(Worklist, AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)
          Worklist.push_back(InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, CM_Scalarize,
            (VF.getKnownMinValue() *
             getMemoryInstructionCost(I, ElementCount::getFixed(1))));
      else if (auto Group = getInterleavedAccessGroup(I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(I))
            setWideningDecision(
                Member, VF, CM_Scalarize,
                (VF.getKnownMinValue() *
                 getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
        }
      }
    } else
      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(I);
  }
}
7774 
7775 InstructionCost
7776 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7777                                                Type *&VectorTy) {
7778   Type *RetTy = I->getType();
7779   if (canTruncateToMinimalBitwidth(I, VF))
7780     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7781   auto SE = PSE.getSE();
7782   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7783 
7784   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7785                                                 ElementCount VF) -> bool {
7786     if (VF.isScalar())
7787       return true;
7788 
7789     auto Scalarized = InstsToScalarize.find(VF);
7790     assert(Scalarized != InstsToScalarize.end() &&
7791            "VF not yet analyzed for scalarization profitability");
7792     return !Scalarized->second.count(I) &&
7793            llvm::all_of(I->users(), [&](User *U) {
7794              auto *UI = cast<Instruction>(U);
7795              return !Scalarized->second.count(UI);
7796            });
7797   };
7798   (void) hasSingleCopyAfterVectorization;
7799 
7800   if (isScalarAfterVectorization(I, VF)) {
7801     // With the exception of GEPs and PHIs, after scalarization there should
7802     // only be one copy of the instruction generated in the loop. This is
7803     // because the VF is either 1, or any instructions that need scalarizing
7804     // have already been dealt with by the the time we get here. As a result,
7805     // it means we don't have to multiply the instruction cost by VF.
7806     assert(I->getOpcode() == Instruction::GetElementPtr ||
7807            I->getOpcode() == Instruction::PHI ||
7808            (I->getOpcode() == Instruction::BitCast &&
7809             I->getType()->isPointerTy()) ||
7810            hasSingleCopyAfterVectorization(I, VF));
7811     VectorTy = RetTy;
7812   } else
7813     VectorTy = ToVectorTy(RetTy, VF);
7814 
7815   // TODO: We need to estimate the cost of intrinsic calls.
7816   switch (I->getOpcode()) {
7817   case Instruction::GetElementPtr:
7818     // We mark this instruction as zero-cost because the cost of GEPs in
7819     // vectorized code depends on whether the corresponding memory instruction
7820     // is scalarized or not. Therefore, we handle GEPs with the memory
7821     // instruction cost.
7822     return 0;
7823   case Instruction::Br: {
7824     // In cases of scalarized and predicated instructions, there will be VF
7825     // predicated blocks in the vectorized loop. Each branch around these
7826     // blocks requires also an extract of its vector compare i1 element.
7827     bool ScalarPredicatedBB = false;
7828     BranchInst *BI = cast<BranchInst>(I);
7829     if (VF.isVector() && BI->isConditional() &&
7830         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7831          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7832       ScalarPredicatedBB = true;
7833 
7834     if (ScalarPredicatedBB) {
7835       // Not possible to scalarize scalable vector with predicated instructions.
7836       if (VF.isScalable())
7837         return InstructionCost::getInvalid();
7838       // Return cost for branches around scalarized and predicated blocks.
7839       auto *Vec_i1Ty =
7840           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7841       return (
7842           TTI.getScalarizationOverhead(
7843               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7844           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7845     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7846       // The back-edge branch will remain, as will all scalar branches.
7847       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7848     else
7849       // This branch will be eliminated by if-conversion.
7850       return 0;
7851     // Note: We currently assume zero cost for an unconditional branch inside
7852     // a predicated block since it will become a fall-through, although we
7853     // may decide in the future to call TTI for all branches.
7854   }
7855   case Instruction::PHI: {
7856     auto *Phi = cast<PHINode>(I);
7857 
7858     // First-order recurrences are replaced by vector shuffles inside the loop.
7859     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7860     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7861       return TTI.getShuffleCost(
7862           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7863           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7864 
7865     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7866     // converted into select instructions. We require N - 1 selects per phi
7867     // node, where N is the number of incoming values.
7868     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7869       return (Phi->getNumIncomingValues() - 1) *
7870              TTI.getCmpSelInstrCost(
7871                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7872                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7873                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7874 
7875     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7876   }
7877   case Instruction::UDiv:
7878   case Instruction::SDiv:
7879   case Instruction::URem:
7880   case Instruction::SRem:
7881     // If we have a predicated instruction, it may not be executed for each
7882     // vector lane. Get the scalarization cost and scale this amount by the
7883     // probability of executing the predicated block. If the instruction is not
7884     // predicated, we fall through to the next case.
7885     if (VF.isVector() && isScalarWithPredication(I)) {
7886       InstructionCost Cost = 0;
7887 
7888       // These instructions have a non-void type, so account for the phi nodes
7889       // that we will create. This cost is likely to be zero. The phi node
7890       // cost, if any, should be scaled by the block probability because it
7891       // models a copy at the end of each predicated block.
7892       Cost += VF.getKnownMinValue() *
7893               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7894 
7895       // The cost of the non-predicated instruction.
7896       Cost += VF.getKnownMinValue() *
7897               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7898 
7899       // The cost of insertelement and extractelement instructions needed for
7900       // scalarization.
7901       Cost += getScalarizationOverhead(I, VF);
7902 
7903       // Scale the cost by the probability of executing the predicated blocks.
7904       // This assumes the predicated block for each vector lane is equally
7905       // likely.
7906       return Cost / getReciprocalPredBlockProb();
7907     }
7908     LLVM_FALLTHROUGH;
7909   case Instruction::Add:
7910   case Instruction::FAdd:
7911   case Instruction::Sub:
7912   case Instruction::FSub:
7913   case Instruction::Mul:
7914   case Instruction::FMul:
7915   case Instruction::FDiv:
7916   case Instruction::FRem:
7917   case Instruction::Shl:
7918   case Instruction::LShr:
7919   case Instruction::AShr:
7920   case Instruction::And:
7921   case Instruction::Or:
7922   case Instruction::Xor: {
7923     // Since we will replace the stride by 1 the multiplication should go away.
7924     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7925       return 0;
7926 
7927     // Detect reduction patterns
7928     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7929       return *RedCost;
7930 
7931     // Certain instructions can be cheaper to vectorize if they have a constant
7932     // second vector operand. One example of this are shifts on x86.
7933     Value *Op2 = I->getOperand(1);
7934     TargetTransformInfo::OperandValueProperties Op2VP;
7935     TargetTransformInfo::OperandValueKind Op2VK =
7936         TTI.getOperandInfo(Op2, Op2VP);
7937     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7938       Op2VK = TargetTransformInfo::OK_UniformValue;
7939 
7940     SmallVector<const Value *, 4> Operands(I->operand_values());
7941     return TTI.getArithmeticInstrCost(
7942         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7943         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7944   }
7945   case Instruction::FNeg: {
7946     return TTI.getArithmeticInstrCost(
7947         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7948         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7949         TargetTransformInfo::OP_None, I->getOperand(0), I);
7950   }
7951   case Instruction::Select: {
7952     SelectInst *SI = cast<SelectInst>(I);
7953     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7954     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7955 
7956     const Value *Op0, *Op1;
7957     using namespace llvm::PatternMatch;
7958     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7959                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7960       // select x, y, false --> x & y
7961       // select x, true, y --> x | y
7962       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7963       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7964       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7965       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7966       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7967               Op1->getType()->getScalarSizeInBits() == 1);
7968 
7969       SmallVector<const Value *, 2> Operands{Op0, Op1};
7970       return TTI.getArithmeticInstrCost(
7971           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7972           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7973     }
7974 
7975     Type *CondTy = SI->getCondition()->getType();
7976     if (!ScalarCond)
7977       CondTy = VectorType::get(CondTy, VF);
7978     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7979                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7980   }
7981   case Instruction::ICmp:
7982   case Instruction::FCmp: {
7983     Type *ValTy = I->getOperand(0)->getType();
7984     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7985     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7986       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7987     VectorTy = ToVectorTy(ValTy, VF);
7988     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7989                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7990   }
7991   case Instruction::Store:
7992   case Instruction::Load: {
7993     ElementCount Width = VF;
7994     if (Width.isVector()) {
7995       InstWidening Decision = getWideningDecision(I, Width);
7996       assert(Decision != CM_Unknown &&
7997              "CM decision should be taken at this point");
7998       if (Decision == CM_Scalarize)
7999         Width = ElementCount::getFixed(1);
8000     }
8001     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
8002     return getMemoryInstructionCost(I, VF);
8003   }
8004   case Instruction::BitCast:
8005     if (I->getType()->isPointerTy())
8006       return 0;
8007     LLVM_FALLTHROUGH;
8008   case Instruction::ZExt:
8009   case Instruction::SExt:
8010   case Instruction::FPToUI:
8011   case Instruction::FPToSI:
8012   case Instruction::FPExt:
8013   case Instruction::PtrToInt:
8014   case Instruction::IntToPtr:
8015   case Instruction::SIToFP:
8016   case Instruction::UIToFP:
8017   case Instruction::Trunc:
8018   case Instruction::FPTrunc: {
8019     // Computes the CastContextHint from a Load/Store instruction.
8020     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
8021       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8022              "Expected a load or a store!");
8023 
8024       if (VF.isScalar() || !TheLoop->contains(I))
8025         return TTI::CastContextHint::Normal;
8026 
8027       switch (getWideningDecision(I, VF)) {
8028       case LoopVectorizationCostModel::CM_GatherScatter:
8029         return TTI::CastContextHint::GatherScatter;
8030       case LoopVectorizationCostModel::CM_Interleave:
8031         return TTI::CastContextHint::Interleave;
8032       case LoopVectorizationCostModel::CM_Scalarize:
8033       case LoopVectorizationCostModel::CM_Widen:
8034         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
8035                                         : TTI::CastContextHint::Normal;
8036       case LoopVectorizationCostModel::CM_Widen_Reverse:
8037         return TTI::CastContextHint::Reversed;
8038       case LoopVectorizationCostModel::CM_Unknown:
8039         llvm_unreachable("Instr did not go through cost modelling?");
8040       }
8041 
8042       llvm_unreachable("Unhandled case!");
8043     };
8044 
8045     unsigned Opcode = I->getOpcode();
8046     TTI::CastContextHint CCH = TTI::CastContextHint::None;
8047     // For Trunc, the context is the only user, which must be a StoreInst.
8048     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
8049       if (I->hasOneUse())
8050         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
8051           CCH = ComputeCCH(Store);
8052     }
8053     // For Z/Sext, the context is the operand, which must be a LoadInst.
8054     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
8055              Opcode == Instruction::FPExt) {
8056       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
8057         CCH = ComputeCCH(Load);
8058     }
8059 
8060     // We optimize the truncation of induction variables having constant
8061     // integer steps. The cost of these truncations is the same as the scalar
8062     // operation.
8063     if (isOptimizableIVTruncate(I, VF)) {
8064       auto *Trunc = cast<TruncInst>(I);
8065       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
8066                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
8067     }
8068 
8069     // Detect reduction patterns
8070     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
8071       return *RedCost;
8072 
8073     Type *SrcScalarTy = I->getOperand(0)->getType();
8074     Type *SrcVecTy =
8075         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
8076     if (canTruncateToMinimalBitwidth(I, VF)) {
8077       // This cast is going to be shrunk. This may remove the cast or it might
8078       // turn it into slightly different cast. For example, if MinBW == 16,
8079       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
8080       //
8081       // Calculate the modified src and dest types.
8082       Type *MinVecTy = VectorTy;
8083       if (Opcode == Instruction::Trunc) {
8084         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
8085         VectorTy =
8086             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
8087       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
8088         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
8089         VectorTy =
8090             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
8091       }
8092     }
8093 
8094     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
8095   }
8096   case Instruction::Call: {
8097     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
8098       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
8099         return *RedCost;
8100     bool NeedToScalarize;
8101     CallInst *CI = cast<CallInst>(I);
8102     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
8103     if (getVectorIntrinsicIDForCall(CI, TLI)) {
8104       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
8105       return std::min(CallCost, IntrinsicCost);
8106     }
8107     return CallCost;
8108   }
8109   case Instruction::ExtractValue:
8110     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
8111   case Instruction::Alloca:
8112     // We cannot easily widen alloca to a scalable alloca, as
8113     // the result would need to be a vector of pointers.
8114     if (VF.isScalable())
8115       return InstructionCost::getInvalid();
8116     LLVM_FALLTHROUGH;
8117   default:
8118     // This opcode is unknown. Assume that it is the same as 'mul'.
8119     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
8120   } // end of switch.
8121 }
8122 
8123 char LoopVectorize::ID = 0;
8124 
8125 static const char lv_name[] = "Loop Vectorization";
8126 
8127 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
8128 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
8129 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
8130 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
8131 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
8132 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
8133 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
8134 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
8135 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
8136 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
8137 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
8138 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
8139 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
8140 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
8141 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
8142 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
8143 
8144 namespace llvm {
8145 
8146 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
8147 
8148 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
8149                               bool VectorizeOnlyWhenForced) {
8150   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
8151 }
8152 
8153 } // end namespace llvm
8154 
8155 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
8156   // Check if the pointer operand of a load or store instruction is
8157   // consecutive.
8158   if (auto *Ptr = getLoadStorePointerOperand(Inst))
8159     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
8160   return false;
8161 }
8162 
8163 void LoopVectorizationCostModel::collectValuesToIgnore() {
8164   // Ignore ephemeral values.
8165   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
8166 
8167   // Ignore type-promoting instructions we identified during reduction
8168   // detection.
8169   for (auto &Reduction : Legal->getReductionVars()) {
8170     RecurrenceDescriptor &RedDes = Reduction.second;
8171     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
8172     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
8173   }
8174   // Ignore type-casting instructions we identified during induction
8175   // detection.
8176   for (auto &Induction : Legal->getInductionVars()) {
8177     InductionDescriptor &IndDes = Induction.second;
8178     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
8179     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
8180   }
8181 }
8182 
8183 void LoopVectorizationCostModel::collectInLoopReductions() {
8184   for (auto &Reduction : Legal->getReductionVars()) {
8185     PHINode *Phi = Reduction.first;
8186     RecurrenceDescriptor &RdxDesc = Reduction.second;
8187 
8188     // We don't collect reductions that are type promoted (yet).
8189     if (RdxDesc.getRecurrenceType() != Phi->getType())
8190       continue;
8191 
8192     // If the target would prefer this reduction to happen "in-loop", then we
8193     // want to record it as such.
8194     unsigned Opcode = RdxDesc.getOpcode();
8195     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
8196         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
8197                                    TargetTransformInfo::ReductionFlags()))
8198       continue;
8199 
8200     // Check that we can correctly put the reductions into the loop, by
8201     // finding the chain of operations that leads from the phi to the loop
8202     // exit value.
8203     SmallVector<Instruction *, 4> ReductionOperations =
8204         RdxDesc.getReductionOpChain(Phi, TheLoop);
8205     bool InLoop = !ReductionOperations.empty();
8206     if (InLoop) {
8207       InLoopReductionChains[Phi] = ReductionOperations;
8208       // Add the elements to InLoopReductionImmediateChains for cost modelling.
8209       Instruction *LastChain = Phi;
8210       for (auto *I : ReductionOperations) {
8211         InLoopReductionImmediateChains[I] = LastChain;
8212         LastChain = I;
8213       }
8214     }
8215     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
8216                       << " reduction for phi: " << *Phi << "\n");
8217   }
8218 }
8219 
8220 // TODO: we could return a pair of values that specify the max VF and
8221 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
8222 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
8223 // doesn't have a cost model that can choose which plan to execute if
8224 // more than one is generated.
8225 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
8226                                  LoopVectorizationCostModel &CM) {
8227   unsigned WidestType;
8228   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
8229   return WidestVectorRegBits / WidestType;
8230 }
8231 
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  assert(!UserVF.isScalable() && "scalable vectors not yet supported");
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      // Derive the VF from the widest fixed-width vector register and the
      // widest scalar type in the loop.
      VF = ElementCount::getFixed(determineVPlanVF(
          TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
              .getFixedSize(),
          CM));
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(4);
      }
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    // Build a plan for exactly this single VF; no cost model exists yet on
    // this path to choose between multiple plans.
    buildVPlans(VF, VF);

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/};
  }

  // Inner (innermost) loops are handled by the regular path, not here.
  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}
8276 
// Top-level planning entry for innermost loops: compute legal VF bounds,
// build VPlans for all candidate VFs, and pick the most profitable one.
// Returns None when the loop must not be vectorized or interleaved.
Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  // Maximum fixed and scalable VFs the loop can legally use; empty means
  // neither vectorization nor interleaving is possible.
  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
    return None;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(*TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  // Honor a user-specified VF when it is within the legal maximum of the
  // matching (fixed vs. scalable) kind.
  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
  if (!UserVF.isZero() && UserVFIsLegal) {
    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (CM.selectUserVectorizationFactor(UserVF)) {
      LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
      CM.collectInLoopReductions();
      buildVPlansWithVPRecipes(UserVF, UserVF);
      LLVM_DEBUG(printPlans(dbgs()));
      return {{UserVF, 0}};
    } else
      reportVectorizationInfo("UserVF ignored because of invalid costs.",
                              "InvalidCost", ORE, OrigLoop);
  }

  // Populate the set of Vectorization Factor Candidates: every power of two
  // up to each maximum, for both fixed and scalable kinds.
  ElementCountSet VFCandidates;
  for (auto VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
    VFCandidates.insert(VF);
  for (auto VF = ElementCount::getScalable(1);
       ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.insert(VF);

  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectUniformsAndScalars(VF);

    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
    if (VF.isVector())
      CM.collectInstsToScalarize(VF);
  }

  CM.collectInLoopReductions();
  // Build plans covering the full candidate ranges; selection happens below.
  buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
  if (!MaxFactors.hasVector())
    return VectorizationFactor::Disabled();

  // Select the optimal vectorization factor.
  auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);

  // Check if it is profitable to vectorize with runtime checks.
  unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
  if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
    bool PragmaThresholdReached =
        NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
    bool ThresholdReached =
        NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
    // Too many memory checks: bail out unless the user explicitly allowed
    // reordering (a pragma-level threshold overrides even that).
    if ((ThresholdReached && !Hints.allowReordering()) ||
        PragmaThresholdReached) {
      ORE->emit([&]() {
        return OptimizationRemarkAnalysisAliasing(
                   DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
                   OrigLoop->getHeader())
               << "loop not vectorized: cannot prove it is safe to reorder "
                  "memory operations";
      });
      LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
      Hints.emitRemarkWithHints();
      return VectorizationFactor::Disabled();
    }
  }
  return SelectedVF;
}
8370 
8371 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
8372   assert(count_if(VPlans,
8373                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
8374              1 &&
8375          "Best VF has not a single VPlan.");
8376 
8377   for (const VPlanPtr &Plan : VPlans) {
8378     if (Plan->hasVF(VF))
8379       return *Plan.get();
8380   }
8381   llvm_unreachable("No plan found!");
8382 }
8383 
// Materialize the chosen VPlan: build the vector-loop skeleton, execute the
// plan's recipes into it, then fix up phis/live-outs and analyses.
void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
                                           VPlan &BestVPlan,
                                           InnerLoopVectorizer &ILV,
                                           DominatorTree *DT) {
  LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
                    << '\n');

  // Perform the actual loop transformation.

  // 1. Create a new empty loop. Unlink the old loop and connect the new one.
  VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
  State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
  // Trip count and canonical IV come from the freshly-created skeleton and
  // must be recorded before any recipe executes.
  State.TripCount = ILV.getOrCreateTripCount(nullptr);
  State.CanonicalIV = ILV.Induction;
  ILV.collectPoisonGeneratingRecipes(State);

  ILV.printDebugTracesAtStart();

  //===------------------------------------------------===//
  //
  // Notice: any optimization or new instruction that go
  // into the code below should also be implemented in
  // the cost-model.
  //
  //===------------------------------------------------===//

  // 2. Copy and widen instructions from the old loop into the new loop.
  BestVPlan.execute(&State);

  // 3. Fix the vectorized code: take care of header phi's, live-outs,
  //    predication, updating analyses.
  ILV.fixVectorizedLoop(State);

  ILV.printDebugTracesAtEnd();
}
8419 
8420 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
8421 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
8422   for (const auto &Plan : VPlans)
8423     if (PrintVPlansInDotFormat)
8424       Plan->printDOT(O);
8425     else
8426       Plan->print(O);
8427 }
8428 #endif
8429 
// Collect instructions of the original loop that will have no use in the
// vectorized loop (exit compares, replaced IV updates, proven-redundant
// casts), so recipe construction can skip them.
void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
    SmallPtrSetImpl<Instruction *> &DeadInstructions) {

  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if it's only used by the
  // terminator
  SmallVector<BasicBlock*> ExitingBlocks;
  OrigLoop->getExitingBlocks(ExitingBlocks);
  for (auto *BB : ExitingBlocks) {
    auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
    if (!Cmp || !Cmp->hasOneUse())
      continue;

    // TODO: we should introduce a getUniqueExitingBlocks on Loop
    if (!DeadInstructions.insert(Cmp).second)
      continue;

    // The operands of the icmp is often a dead trunc, used by IndUpdate.
    // TODO: can recurse through operands in general
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
          DeadInstructions.insert(cast<Instruction>(Op));
    }
  }

  // We create new "steps" for induction variable updates to which the original
  // induction variables map. An original update instruction will be dead if
  // all its users except the induction variable are dead.
  auto *Latch = OrigLoop->getLoopLatch();
  for (auto &Induction : Legal->getInductionVars()) {
    PHINode *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));

    // If the tail is to be folded by masking, the primary induction variable,
    // if exists, isn't dead: it will be used for masking. Don't kill it.
    // NOTE(review): this compares the *update* instruction against the
    // primary induction PHI returned by getPrimaryInduction(); presumably the
    // guard was intended to fire for the primary IV itself (i.e. compare
    // `Ind`) — confirm the intended operand before relying on this check.
    if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
      continue;

    if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
        }))
      DeadInstructions.insert(IndUpdate);

    // We record as "Dead" also the type-casting instructions we had identified
    // during induction analysis. We don't need any handling for them in the
    // vectorized loop because we have proven that, under a proper runtime
    // test guarding the vectorized loop, the value of the phi, and the casted
    // value of the phi, are the same. The last instruction in this casting chain
    // will get its scalar/vector/widened def from the scalar/vector/widened def
    // of the respective phi node. Any other casts in the induction def-use chain
    // have no other uses outside the phi update chain, and will be ignored.
    InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    DeadInstructions.insert(Casts.begin(), Casts.end());
  }
}
8486 
// With VF = 1 (unrolling only) there are no vector lanes to reverse; the
// value is returned unchanged.
Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
8488 
// With VF = 1 (unrolling only) broadcasting a scalar is the identity.
Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
8490 
8491 Value *InnerLoopUnroller::getStepVector(Value *Val, Value *StartIdx,
8492                                         Value *Step,
8493                                         Instruction::BinaryOps BinOp) {
8494   // When unrolling and the VF is 1, we only need to add a simple scalar.
8495   Type *Ty = Val->getType();
8496   assert(!Ty->isVectorTy() && "Val must be a scalar");
8497 
8498   if (Ty->isFloatingPointTy()) {
8499     // Floating-point operations inherit FMF via the builder's flags.
8500     Value *MulOp = Builder.CreateFMul(StartIdx, Step);
8501     return Builder.CreateBinOp(BinOp, Val, MulOp);
8502   }
8503   return Builder.CreateAdd(Val, Builder.CreateMul(StartIdx, Step), "induction");
8504 }
8505 
8506 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
8507   SmallVector<Metadata *, 4> MDs;
8508   // Reserve first location for self reference to the LoopID metadata node.
8509   MDs.push_back(nullptr);
8510   bool IsUnrollMetadata = false;
8511   MDNode *LoopID = L->getLoopID();
8512   if (LoopID) {
8513     // First find existing loop unrolling disable metadata.
8514     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
8515       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
8516       if (MD) {
8517         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
8518         IsUnrollMetadata =
8519             S && S->getString().startswith("llvm.loop.unroll.disable");
8520       }
8521       MDs.push_back(LoopID->getOperand(i));
8522     }
8523   }
8524 
8525   if (!IsUnrollMetadata) {
8526     // Add runtime unroll disable metadata.
8527     LLVMContext &Context = L->getHeader()->getContext();
8528     SmallVector<Metadata *, 1> DisableOperands;
8529     DisableOperands.push_back(
8530         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
8531     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
8532     MDs.push_back(DisableNode);
8533     MDNode *NewLoopID = MDNode::get(Context, MDs);
8534     // Set operand 0 to refer to the loop id itself.
8535     NewLoopID->replaceOperandWith(0, NewLoopID);
8536     L->setLoopID(NewLoopID);
8537   }
8538 }
8539 
8540 //===--------------------------------------------------------------------===//
8541 // EpilogueVectorizerMainLoop
8542 //===--------------------------------------------------------------------===//
8543 
8544 /// This function is partially responsible for generating the control flow
8545 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
  // Preserve the original loop ID so completeLoopSkeleton can migrate it.
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  Loop *Lp = createVectorLoopSkeleton("");

  // Generate the code to check the minimum iteration count of the vector
  // epilogue (see below).
  EPI.EpilogueIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
  EPI.EpilogueIterationCountCheck->setName("iter.check");

  // Generate the code to check any assumptions that we've made for SCEV
  // expressions.
  EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);

  // Generate the code that checks at runtime if arrays overlap. We put the
  // checks into a separate block to make the more common case of few elements
  // faster.
  EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);

  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for, by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Type *IdxTy = Legal->getWidestInductionType();
  Value *StartIdx = ConstantInt::get(IdxTy, 0);

  // The step is VF * UF of the *main* loop; it is also saved (as the vector
  // trip count) for reuse by the epilogue pass.
  IRBuilder<> B(&*Lp->getLoopPreheader()->getFirstInsertionPt());
  Value *Step = getRuntimeVF(B, IdxTy, VF * UF);
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  EPI.VectorTripCount = CountRoundDown;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Skip induction resume value creation here because they will be created in
  // the second pass. If we created them here, they wouldn't be used anyway,
  // because the vplan in the second pass still contains the inductions from the
  // original loop.

  return completeLoopSkeleton(Lp, OrigLoopID);
}
8594 
8595 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
8596   LLVM_DEBUG({
8597     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
8598            << "Main Loop VF:" << EPI.MainLoopVF
8599            << ", Main Loop UF:" << EPI.MainLoopUF
8600            << ", Epilogue Loop VF:" << EPI.EpilogueVF
8601            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8602   });
8603 }
8604 
8605 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
8606   DEBUG_WITH_TYPE(VerboseDebug, {
8607     dbgs() << "intermediate fn:\n"
8608            << *OrigLoop->getHeader()->getParent() << "\n";
8609   });
8610 }
8611 
8612 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8613     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
8614   assert(L && "Expected valid Loop.");
8615   assert(Bypass && "Expected valid bypass basic block.");
8616   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
8617   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8618   Value *Count = getOrCreateTripCount(L);
8619   // Reuse existing vector loop preheader for TC checks.
8620   // Note that new preheader block is generated for vector loop.
8621   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8622   IRBuilder<> Builder(TCCheckBlock->getTerminator());
8623 
8624   // Generate code to check if the loop's trip count is less than VF * UF of the
8625   // main vector loop.
8626   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
8627       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8628 
8629   Value *CheckMinIters = Builder.CreateICmp(
8630       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
8631       "min.iters.check");
8632 
8633   if (!ForEpilogue)
8634     TCCheckBlock->setName("vector.main.loop.iter.check");
8635 
8636   // Create new preheader for vector loop.
8637   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8638                                    DT, LI, nullptr, "vector.ph");
8639 
8640   if (ForEpilogue) {
8641     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8642                                  DT->getNode(Bypass)->getIDom()) &&
8643            "TC check is expected to dominate Bypass");
8644 
8645     // Update dominator for Bypass & LoopExit.
8646     DT->changeImmediateDominator(Bypass, TCCheckBlock);
8647     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8648       // For loops with multiple exits, there's no edge from the middle block
8649       // to exit blocks (as the epilogue must run) and thus no need to update
8650       // the immediate dominator of the exit blocks.
8651       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8652 
8653     LoopBypassBlocks.push_back(TCCheckBlock);
8654 
8655     // Save the trip count so we don't have to regenerate it in the
8656     // vec.epilog.iter.check. This is safe to do because the trip count
8657     // generated here dominates the vector epilog iter check.
8658     EPI.TripCount = Count;
8659   }
8660 
8661   ReplaceInstWithInst(
8662       TCCheckBlock->getTerminator(),
8663       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8664 
8665   return TCCheckBlock;
8666 }
8667 
8668 //===--------------------------------------------------------------------===//
8669 // EpilogueVectorizerEpilogueLoop
8670 //===--------------------------------------------------------------------===//
8671 
8672 /// This function is partially responsible for generating the control flow
8673 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
BasicBlock *
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
  // Preserve the original loop ID so completeLoopSkeleton can migrate it.
  MDNode *OrigLoopID = OrigLoop->getLoopID();
  Loop *Lp = createVectorLoopSkeleton("vec.epilog.");

  // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue skip to the scalar part.
  BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
  VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
  LoopVectorPreHeader =
      SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
                 LI, nullptr, "vec.epilog.ph");
  emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
                                          VecEpilogueIterationCountCheck);

  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass.");
  // The main loop's failing iteration-count check now targets the epilogue's
  // preheader directly (the epilogue should run even when the main loop was
  // skipped).
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopVectorPreHeader);

  DT->changeImmediateDominator(LoopVectorPreHeader,
                               EPI.MainLoopIterationCountCheck);

  // The checks emitted in the first pass that used to fall through into the
  // epilogue check now go straight to the scalar preheader.
  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  if (EPI.SCEVSafetyCheck)
    EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);
  if (EPI.MemSafetyCheck)
    EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
        VecEpilogueIterationCountCheck, LoopScalarPreHeader);

  DT->changeImmediateDominator(
      VecEpilogueIterationCountCheck,
      VecEpilogueIterationCountCheck->getSinglePredecessor());

  DT->changeImmediateDominator(LoopScalarPreHeader,
                               EPI.EpilogueIterationCountCheck);
  if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks  and thus no need to update the immediate
    // dominator of the exit blocks.
    DT->changeImmediateDominator(LoopExitBlock,
                                 EPI.EpilogueIterationCountCheck);

  // Keep track of bypass blocks, as they feed start values to the induction
  // phis in the scalar loop preheader.
  if (EPI.SCEVSafetyCheck)
    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
  if (EPI.MemSafetyCheck)
    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);

  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader: the main loop's vector trip count when coming
  // from it, zero when the main loop was skipped entirely.
  Type *IdxTy = Legal->getWidestInductionType();
  PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
                                         LoopVectorPreHeader->getFirstNonPHI());
  EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
  EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
                           EPI.MainLoopIterationCountCheck);

  // Generate the induction variable.
  OldInduction = Legal->getPrimaryInduction();
  Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
  Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
  Value *StartIdx = EPResumeVal;
  Induction =
      createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
                              getDebugLocFromInstOrOperands(OldInduction));

  // Generate induction resume values. These variables save the new starting
  // indexes for the scalar loop. They are used to test if there are any tail
  // iterations left once the vector loop has completed.
  // Note that when the vectorized epilogue is skipped due to iteration count
  // check, then the resume value for the induction variable comes from
  // the trip count of the main vector loop, hence passing the AdditionalBypass
  // argument.
  createInductionResumeValues(Lp, CountRoundDown,
                              {VecEpilogueIterationCountCheck,
                               EPI.VectorTripCount} /* AdditionalBypass */);

  AddRuntimeUnrollDisableMetaData(Lp);
  return completeLoopSkeleton(Lp, OrigLoopID);
}
8762 
8763 BasicBlock *
8764 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8765     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8766 
8767   assert(EPI.TripCount &&
8768          "Expected trip count to have been safed in the first pass.");
8769   assert(
8770       (!isa<Instruction>(EPI.TripCount) ||
8771        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8772       "saved trip count does not dominate insertion point.");
8773   Value *TC = EPI.TripCount;
8774   IRBuilder<> Builder(Insert->getTerminator());
8775   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8776 
8777   // Generate code to check if the loop's trip count is less than VF * UF of the
8778   // vector epilogue loop.
8779   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8780       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8781 
8782   Value *CheckMinIters =
8783       Builder.CreateICmp(P, Count,
8784                          createStepForVF(Builder, Count->getType(),
8785                                          EPI.EpilogueVF, EPI.EpilogueUF),
8786                          "min.epilog.iters.check");
8787 
8788   ReplaceInstWithInst(
8789       Insert->getTerminator(),
8790       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8791 
8792   LoopBypassBlocks.push_back(Insert);
8793   return Insert;
8794 }
8795 
8796 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8797   LLVM_DEBUG({
8798     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8799            << "Epilogue Loop VF:" << EPI.EpilogueVF
8800            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8801   });
8802 }
8803 
8804 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8805   DEBUG_WITH_TYPE(VerboseDebug, {
8806     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8807   });
8808 }
8809 
8810 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8811     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8812   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8813   bool PredicateAtRangeStart = Predicate(Range.Start);
8814 
8815   for (ElementCount TmpVF = Range.Start * 2;
8816        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8817     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8818       Range.End = TmpVF;
8819       break;
8820     }
8821 
8822   return PredicateAtRangeStart;
8823 }
8824 
8825 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8826 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8827 /// of VF's starting at a given VF and extending it as much as possible. Each
8828 /// vectorization decision can potentially shorten this sub-range during
8829 /// buildVPlan().
8830 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8831                                            ElementCount MaxVF) {
8832   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8833   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8834     VFRange SubRange = {VF, MaxVFPlusOne};
8835     VPlans.push_back(buildVPlan(SubRange));
8836     VF = SubRange.End;
8837   }
8838 }
8839 
8840 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8841                                          VPlanPtr &Plan) {
8842   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8843 
8844   // Look for cached value.
8845   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8846   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8847   if (ECEntryIt != EdgeMaskCache.end())
8848     return ECEntryIt->second;
8849 
8850   VPValue *SrcMask = createBlockInMask(Src, Plan);
8851 
8852   // The terminator has to be a branch inst!
8853   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8854   assert(BI && "Unexpected terminator found");
8855 
8856   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8857     return EdgeMaskCache[Edge] = SrcMask;
8858 
8859   // If source is an exiting block, we know the exit edge is dynamically dead
8860   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8861   // adding uses of an otherwise potentially dead instruction.
8862   if (OrigLoop->isLoopExiting(Src))
8863     return EdgeMaskCache[Edge] = SrcMask;
8864 
8865   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8866   assert(EdgeMask && "No Edge Mask found for condition");
8867 
8868   if (BI->getSuccessor(0) != Dst)
8869     EdgeMask = Builder.createNot(EdgeMask);
8870 
8871   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8872     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8873     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8874     // The select version does not introduce new UB if SrcMask is false and
8875     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8876     VPValue *False = Plan->getOrAddVPValue(
8877         ConstantInt::getFalse(BI->getCondition()->getType()));
8878     EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
8879   }
8880 
8881   return EdgeMaskCache[Edge] = EdgeMask;
8882 }
8883 
// Compute (and cache) the predicate mask under which block \p BB executes in
// the vectorized loop. A nullptr result means the mask is known all-one.
// For the loop header the mask is derived from the canonical IV; for any
// other block it is the OR of its incoming edge masks.
VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");

  // Look for cached value.
  BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
  if (BCEntryIt != BlockMaskCache.end())
    return BCEntryIt->second;

  // All-one mask is modelled as no-mask following the convention for masked
  // load/store/gather/scatter. Initialize BlockMask to no-mask.
  VPValue *BlockMask = nullptr;

  if (OrigLoop->getHeader() == BB) {
    // Header mask: all-one unless the cost model says the block needs
    // predication (e.g. for tail folding).
    if (!CM.blockNeedsPredicationForAnyReason(BB))
      return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.

    // Create the block in mask as the first non-phi instruction in the block.
    // The guard restores the builder's previous insert point on scope exit.
    VPBuilder::InsertPointGuard Guard(Builder);
    auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
    Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);

    // Introduce the early-exit compare IV <= BTC to form header block mask.
    // This is used instead of IV < TC because TC may wrap, unlike BTC.
    // Start by constructing the desired canonical IV.
    VPValue *IV = nullptr;
    if (Legal->getPrimaryInduction())
      IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
    else {
      // No primary induction: materialize a canonical IV recipe at the chosen
      // insertion point.
      auto *IVRecipe = new VPWidenCanonicalIVRecipe();
      Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
      IV = IVRecipe;
    }
    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
    bool TailFolded = !CM.isScalarEpilogueAllowed();

    if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as a second argument, we only pass the IV here and extract the
      // tripcount from the transform state where codegen of the VP instructions
      // happen.
      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
    } else {
      BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    }
    return BlockMaskCache[BB] = BlockMask;
  }

  // This is the block mask. We OR all incoming edges.
  for (auto *Predecessor : predecessors(BB)) {
    VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
    if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
      return BlockMaskCache[BB] = EdgeMask;

    if (!BlockMask) { // BlockMask has its initialized nullptr value.
      BlockMask = EdgeMask;
      continue;
    }

    BlockMask = Builder.createOr(BlockMask, EdgeMask);
  }

  return BlockMaskCache[BB] = BlockMask;
}
8947 
8948 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8949                                                 ArrayRef<VPValue *> Operands,
8950                                                 VFRange &Range,
8951                                                 VPlanPtr &Plan) {
8952   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8953          "Must be called with either a load or store");
8954 
8955   auto willWiden = [&](ElementCount VF) -> bool {
8956     if (VF.isScalar())
8957       return false;
8958     LoopVectorizationCostModel::InstWidening Decision =
8959         CM.getWideningDecision(I, VF);
8960     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8961            "CM decision should be taken at this point.");
8962     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8963       return true;
8964     if (CM.isScalarAfterVectorization(I, VF) ||
8965         CM.isProfitableToScalarize(I, VF))
8966       return false;
8967     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8968   };
8969 
8970   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8971     return nullptr;
8972 
8973   VPValue *Mask = nullptr;
8974   if (Legal->isMaskRequired(I))
8975     Mask = createBlockInMask(I->getParent(), Plan);
8976 
8977   // Determine if the pointer operand of the access is either consecutive or
8978   // reverse consecutive.
8979   LoopVectorizationCostModel::InstWidening Decision =
8980       CM.getWideningDecision(I, Range.Start);
8981   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8982   bool Consecutive =
8983       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8984 
8985   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8986     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8987                                               Consecutive, Reverse);
8988 
8989   StoreInst *Store = cast<StoreInst>(I);
8990   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8991                                             Mask, Consecutive, Reverse);
8992 }
8993 
8994 VPWidenIntOrFpInductionRecipe *
8995 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
8996                                            ArrayRef<VPValue *> Operands) const {
8997   // Check if this is an integer or fp induction. If so, build the recipe that
8998   // produces its scalar and vector values.
8999   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
9000   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
9001       II.getKind() == InductionDescriptor::IK_FpInduction) {
9002     assert(II.getStartValue() ==
9003            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
9004     const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts();
9005     return new VPWidenIntOrFpInductionRecipe(
9006         Phi, Operands[0], Casts.empty() ? nullptr : Casts.front());
9007   }
9008 
9009   return nullptr;
9010 }
9011 
9012 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
9013     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
9014     VPlan &Plan) const {
9015   // Optimize the special case where the source is a constant integer
9016   // induction variable. Notice that we can only optimize the 'trunc' case
9017   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
9018   // (c) other casts depend on pointer size.
9019 
9020   // Determine whether \p K is a truncation based on an induction variable that
9021   // can be optimized.
9022   auto isOptimizableIVTruncate =
9023       [&](Instruction *K) -> std::function<bool(ElementCount)> {
9024     return [=](ElementCount VF) -> bool {
9025       return CM.isOptimizableIVTruncate(K, VF);
9026     };
9027   };
9028 
9029   if (LoopVectorizationPlanner::getDecisionAndClampRange(
9030           isOptimizableIVTruncate(I), Range)) {
9031 
9032     InductionDescriptor II =
9033         Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
9034     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
9035     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
9036                                              Start, nullptr, I);
9037   }
9038   return nullptr;
9039 }
9040 
9041 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
9042                                                 ArrayRef<VPValue *> Operands,
9043                                                 VPlanPtr &Plan) {
9044   // If all incoming values are equal, the incoming VPValue can be used directly
9045   // instead of creating a new VPBlendRecipe.
9046   VPValue *FirstIncoming = Operands[0];
9047   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
9048         return FirstIncoming == Inc;
9049       })) {
9050     return Operands[0];
9051   }
9052 
9053   // We know that all PHIs in non-header blocks are converted into selects, so
9054   // we don't have to worry about the insertion order and we can just use the
9055   // builder. At this point we generate the predication tree. There may be
9056   // duplications since this is a simple recursive scan, but future
9057   // optimizations will clean it up.
9058   SmallVector<VPValue *, 2> OperandsWithMask;
9059   unsigned NumIncoming = Phi->getNumIncomingValues();
9060 
9061   for (unsigned In = 0; In < NumIncoming; In++) {
9062     VPValue *EdgeMask =
9063       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
9064     assert((EdgeMask || NumIncoming == 1) &&
9065            "Multiple predecessors with one having a full mask");
9066     OperandsWithMask.push_back(Operands[In]);
9067     if (EdgeMask)
9068       OperandsWithMask.push_back(EdgeMask);
9069   }
9070   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
9071 }
9072 
9073 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
9074                                                    ArrayRef<VPValue *> Operands,
9075                                                    VFRange &Range) const {
9076 
9077   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
9078       [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
9079       Range);
9080 
9081   if (IsPredicated)
9082     return nullptr;
9083 
9084   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9085   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
9086              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
9087              ID == Intrinsic::pseudoprobe ||
9088              ID == Intrinsic::experimental_noalias_scope_decl))
9089     return nullptr;
9090 
9091   auto willWiden = [&](ElementCount VF) -> bool {
9092     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9093     // The following case may be scalarized depending on the VF.
9094     // The flag shows whether we use Intrinsic or a usual Call for vectorized
9095     // version of the instruction.
9096     // Is it beneficial to perform intrinsic call compared to lib call?
9097     bool NeedToScalarize = false;
9098     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
9099     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
9100     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
9101     return UseVectorIntrinsic || !NeedToScalarize;
9102   };
9103 
9104   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
9105     return nullptr;
9106 
9107   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
9108   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
9109 }
9110 
9111 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
9112   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
9113          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
9114   // Instruction should be widened, unless it is scalar after vectorization,
9115   // scalarization is profitable or it is predicated.
9116   auto WillScalarize = [this, I](ElementCount VF) -> bool {
9117     return CM.isScalarAfterVectorization(I, VF) ||
9118            CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
9119   };
9120   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
9121                                                              Range);
9122 }
9123 
9124 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
9125                                            ArrayRef<VPValue *> Operands) const {
9126   auto IsVectorizableOpcode = [](unsigned Opcode) {
9127     switch (Opcode) {
9128     case Instruction::Add:
9129     case Instruction::And:
9130     case Instruction::AShr:
9131     case Instruction::BitCast:
9132     case Instruction::FAdd:
9133     case Instruction::FCmp:
9134     case Instruction::FDiv:
9135     case Instruction::FMul:
9136     case Instruction::FNeg:
9137     case Instruction::FPExt:
9138     case Instruction::FPToSI:
9139     case Instruction::FPToUI:
9140     case Instruction::FPTrunc:
9141     case Instruction::FRem:
9142     case Instruction::FSub:
9143     case Instruction::ICmp:
9144     case Instruction::IntToPtr:
9145     case Instruction::LShr:
9146     case Instruction::Mul:
9147     case Instruction::Or:
9148     case Instruction::PtrToInt:
9149     case Instruction::SDiv:
9150     case Instruction::Select:
9151     case Instruction::SExt:
9152     case Instruction::Shl:
9153     case Instruction::SIToFP:
9154     case Instruction::SRem:
9155     case Instruction::Sub:
9156     case Instruction::Trunc:
9157     case Instruction::UDiv:
9158     case Instruction::UIToFP:
9159     case Instruction::URem:
9160     case Instruction::Xor:
9161     case Instruction::ZExt:
9162       return true;
9163     }
9164     return false;
9165   };
9166 
9167   if (!IsVectorizableOpcode(I->getOpcode()))
9168     return nullptr;
9169 
9170   // Success: widen this instruction.
9171   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
9172 }
9173 
9174 void VPRecipeBuilder::fixHeaderPhis() {
9175   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
9176   for (VPWidenPHIRecipe *R : PhisToFix) {
9177     auto *PN = cast<PHINode>(R->getUnderlyingValue());
9178     VPRecipeBase *IncR =
9179         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
9180     R->addOperand(IncR->getVPSingleValue());
9181   }
9182 }
9183 
// Build a VPReplicateRecipe for \p I, scalarizing it per lane (or once, when
// uniform). A predicated instruction is additionally wrapped in a
// single-entry single-exit replicate region; in that case the returned
// VPBasicBlock is a fresh successor block for subsequent recipes, otherwise
// the unchanged \p VPBB is returned.
VPBasicBlock *VPRecipeBuilder::handleReplication(
    Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
    VPlanPtr &Plan) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  // Note: the predication query takes IsUniform, so it must be computed
  // after the uniformity decision above.
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      [&](ElementCount VF) { return CM.isPredicatedInst(I, IsUniform); },
      Range);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
    switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors if one of the operands is variant then we still
      // want to mark as uniform, which will generate one instruction for just
      // the first lane of the vector. We can't scalarize the call in the same
      // way as for fixed-width vectors because we don't know how many lanes
      // there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic generating the instruction for the first
      //      lane is still be better than not generating any at all. For
      //      example, the input may be a splat across all lanes.
      //   2. For the lifetime start/end intrinsics the pointer operand only
      //      does anything useful when the input comes from a stack object,
      //      which suggests it should always be uniform. For non-stack objects
      //      the effect is to poison the object, which still allows us to
      //      remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }

  auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
                                       IsUniform, IsPredicated);
  setRecipe(I, Recipe);
  Plan->addVPValue(I, Recipe);

  // Find if I uses a predicated instruction. If so, it will use its scalar
  // value. Avoid hoisting the insert-element which packs the scalar value into
  // a vector value, as that happens iff all users use the vector value.
  for (VPValue *Op : Recipe->operands()) {
    auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
    if (!PredR)
      continue;
    auto *RepR =
        cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
    assert(RepR->isPredicated() &&
           "expected Replicate recipe to be predicated");
    RepR->setAlsoPack(false);
  }

  // Finalize the recipe for Instr, first if it is not predicated.
  if (!IsPredicated) {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
    VPBB->appendRecipe(Recipe);
    return VPBB;
  }
  LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
  assert(VPBB->getSuccessors().empty() &&
         "VPBB has successors when handling predicated replication.");
  // Record predicated instructions for above packing optimizations.
  VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
  VPBlockUtils::insertBlockAfter(Region, VPBB);
  // Continue building recipes in a fresh block following the region.
  auto *RegSucc = new VPBasicBlock();
  VPBlockUtils::insertBlockAfter(RegSucc, Region);
  return RegSucc;
}
9261 
// Wrap the predicated recipe \p PredRecipe for \p Instr in a triangular
// single-entry single-exit region: entry (branch-on-mask) -> "if" block
// (the recipe) -> "continue" block (a phi merging the per-lane result, when
// the instruction produces a value).
VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
                                                      VPRecipeBase *PredRecipe,
                                                      VPlanPtr &Plan) {
  // Instructions marked for predication are replicated and placed under an
  // if-then construct to prevent side-effects.

  // Generate recipes to compute the block mask for this region.
  VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);

  // Build the triangular if-then region.
  std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
  assert(Instr->getParent() && "Predicated instruction not in any basic block");
  auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
  auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
  // Void-producing instructions need no merge phi on the continue block.
  auto *PHIRecipe = Instr->getType()->isVoidTy()
                        ? nullptr
                        : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
  if (PHIRecipe) {
    // The phi, not the replicated recipe, now represents Instr's result.
    Plan->removeVPValueFor(Instr);
    Plan->addVPValue(Instr, PHIRecipe);
  }
  auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
  auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
  VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);

  // Note: first set Entry as region entry and then connect successors starting
  // from it in order, to propagate the "parent" of each VPBasicBlock.
  VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
  VPBlockUtils::connectBlocks(Pred, Exit);

  return Region;
}
9294 
// Central dispatch for recipe creation: try to build a widening recipe (or
// reuse an existing VPValue) for \p Instr over \p Range. An empty result
// tells the caller to fall back to scalarization via replication.
VPRecipeOrVPValueTy
VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
                                        ArrayRef<VPValue *> Operands,
                                        VFRange &Range, VPlanPtr &Plan) {
  // First, check for specific widening recipes that deal with calls, memory
  // operations, inductions and Phi nodes.
  if (auto *CI = dyn_cast<CallInst>(Instr))
    return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));

  if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
    return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));

  VPRecipeBase *Recipe;
  if (auto Phi = dyn_cast<PHINode>(Instr)) {
    // Phis outside the loop header are converted to blends (selects).
    if (Phi->getParent() != OrigLoop->getHeader())
      return tryToBlend(Phi, Operands, Plan);
    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
      return toVPRecipeResult(Recipe);

    // Remaining header phis: reductions, first-order recurrences, and
    // (currently) pointer-typed phis.
    VPWidenPHIRecipe *PhiRecipe = nullptr;
    if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
      VPValue *StartV = Operands[0];
      if (Legal->isReductionVariable(Phi)) {
        RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
        assert(RdxDesc.getRecurrenceStartValue() ==
               Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
        PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
                                             CM.isInLoopReduction(Phi),
                                             CM.useOrderedReductions(RdxDesc));
      } else {
        PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
      }

      // Record the incoming value from the backedge, so we can add the incoming
      // value from the backedge after all recipes have been created.
      recordRecipeOf(cast<Instruction>(
          Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
      PhisToFix.push_back(PhiRecipe);
    } else {
      // TODO: record start and backedge value for remaining pointer induction
      // phis.
      assert(Phi->getType()->isPointerTy() &&
             "only pointer phis should be handled here");
      PhiRecipe = new VPWidenPHIRecipe(Phi);
    }

    return toVPRecipeResult(PhiRecipe);
  }

  if (isa<TruncInst>(Instr) &&
      (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
                                               Range, *Plan)))
    return toVPRecipeResult(Recipe);

  // Everything else must be profitable to widen across the whole range.
  if (!shouldWiden(Instr, Range))
    return nullptr;

  if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
    return toVPRecipeResult(new VPWidenGEPRecipe(
        GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));

  if (auto *SI = dyn_cast<SelectInst>(Instr)) {
    // A loop-invariant select condition enables a cheaper widened form.
    bool InvariantCond =
        PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
    return toVPRecipeResult(new VPWidenSelectRecipe(
        *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
  }

  return toVPRecipeResult(tryToWiden(Instr, Operands));
}
9365 
// Build a recipe-based VPlan for each feasible VF sub-range in
// [MinVF, MaxVF]. Before delegating to buildVPlanWithVPRecipes, collect
// instructions that become trivially dead and retarget sink-after entries
// that would otherwise point at dead instructions.
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  // Collect instructions from the original loop that will become trivially dead
  // in the vectorized loop. We don't need to vectorize these instructions. For
  // example, original induction update instructions can become dead because we
  // separately emit induction "steps" when generating code for the new loop.
  // Similarly, we create a new latch condition when setting up the structure
  // of the new loop, so the old one can become dead.
  SmallPtrSet<Instruction *, 4> DeadInstructions;
  collectTriviallyDeadInstructions(DeadInstructions);

  // Add assume instructions we need to drop to DeadInstructions, to prevent
  // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
  auto &ConditionalAssumes = Legal->getConditionalAssumes();
  DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());

  MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
  // Dead instructions do not need sinking. Remove them from SinkAfter.
  for (Instruction *I : DeadInstructions)
    SinkAfter.erase(I);

  // Cannot sink instructions after dead instructions (there won't be any
  // recipes for them). Instead, find the first non-dead previous instruction.
  for (auto &P : Legal->getSinkAfter()) {
    Instruction *SinkTarget = P.second;
    Instruction *FirstInst = &*SinkTarget->getParent()->begin();
    (void)FirstInst;
    // Walk backwards from the recorded target until a live instruction is
    // found; the asserts guarantee we never walk off the block or onto the
    // sink source itself.
    while (DeadInstructions.contains(SinkTarget)) {
      assert(
          SinkTarget != FirstInst &&
          "Must find a live instruction (at least the one feeding the "
          "first-order recurrence PHI) before reaching beginning of the block");
      SinkTarget = SinkTarget->getPrevNode();
      assert(SinkTarget != P.first &&
             "sink source equals target, no sinking required");
    }
    P.second = SinkTarget;
  }

  // Build one VPlan per VF sub-range; each buildVPlanWithVPRecipes call may
  // clamp SubRange.End, which seeds the next iteration.
  auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
    VFRange SubRange = {VF, MaxVFPlusOne};
    VPlans.push_back(
        buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
    VF = SubRange.End;
  }
}
9417 
// Build the initial VPlan for the given (possibly clamped) VF Range: create a
// recipe for every relevant ingredient of the original loop, then apply the
// previously-taken decisions (sink-after, interleave groups, in-loop
// reductions, first-order recurrences) and VPlan-to-VPlan transforms to bring
// the plan to its final state.
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
    const MapVector<Instruction *, Instruction *> &SinkAfter) {

  // Interleave groups applicable to this Range; their member recipes are
  // replaced by a single VPInterleaveRecipe after the initial plan is built.
  SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;

  VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);

  // ---------------------------------------------------------------------------
  // Pre-construction: record ingredients whose recipes we'll need to further
  // process after constructing the initial VPlan.
  // ---------------------------------------------------------------------------

  // Mark instructions we'll need to sink later and their targets as
  // ingredients whose recipe we'll need to record.
  for (auto &Entry : SinkAfter) {
    RecipeBuilder.recordRecipeOf(Entry.first);
    RecipeBuilder.recordRecipeOf(Entry.second);
  }
  // Record the phi and every operation of each in-loop reduction chain, so
  // adjustRecipesForReductions can later find and replace their recipes.
  for (auto &Reduction : CM.getInLoopReductionChains()) {
    PHINode *Phi = Reduction.first;
    RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
    const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;

    RecipeBuilder.recordRecipeOf(Phi);
    for (auto &R : ReductionOperations) {
      RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
      assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
             "Only min/max recurrences allowed for inloop reductions");
      if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
        RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
    }
  }

  // For each interleave group which is relevant for this (possibly trimmed)
  // Range, add it to the set of groups to be later applied to the VPlan and add
  // placeholders for its members' Recipes which we'll be replacing with a
  // single VPInterleaveRecipe.
  for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
    auto applyIG = [IG, this](ElementCount VF) -> bool {
      return (VF.isVector() && // Query is illegal for VF == 1
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
    };
    // Clamp Range to the prefix of VFs for which the interleaving decision
    // holds; skip the group if it does not apply to Range.Start at all.
    if (!getDecisionAndClampRange(applyIG, Range))
      continue;
    InterleaveGroups.insert(IG);
    for (unsigned i = 0; i < IG->getFactor(); i++)
      if (Instruction *Member = IG->getMember(i))
        RecipeBuilder.recordRecipeOf(Member);
  };

  // ---------------------------------------------------------------------------
  // Build initial VPlan: Scan the body of the loop in a topological order to
  // visit each basic block after having visited its predecessor basic blocks.
  // ---------------------------------------------------------------------------

  auto Plan = std::make_unique<VPlan>();

  // Scan the body of the loop in a topological order to visit each basic block
  // after having visited its predecessor basic blocks.
  LoopBlocksDFS DFS(OrigLoop);
  DFS.perform(LI);

  VPBasicBlock *VPBB = nullptr;
  VPBasicBlock *HeaderVPBB = nullptr;
  // Induction recipes created outside the header's phi section (see below);
  // moved into place after sink-after has been applied.
  SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and fill a new VPBasicBlock.
    unsigned VPBBsForBB = 0;
    auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
    if (VPBB)
      VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
    else {
      // First visited block: wrap the plan's body in a top-level region and
      // remember the header block.
      auto *TopRegion = new VPRegionBlock("vector loop");
      TopRegion->setEntry(FirstVPBBForBB);
      Plan->setEntry(TopRegion);
      HeaderVPBB = FirstVPBBForBB;
    }
    VPBB = FirstVPBBForBB;
    Builder.setInsertPoint(VPBB);

    // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      Instruction *Instr = &I;

      // First filter out irrelevant instructions, to ensure no recipes are
      // built for them.
      if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
        continue;

      // Header phis take only their start value (the preheader incoming value)
      // as operand; all other instructions map their IR operands to VPValues.
      SmallVector<VPValue *, 4> Operands;
      auto *Phi = dyn_cast<PHINode>(Instr);
      if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
        Operands.push_back(Plan->getOrAddVPValue(
            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
      } else {
        auto OpRange = Plan->mapToVPValues(Instr->operands());
        Operands = {OpRange.begin(), OpRange.end()};
      }
      if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
              Instr, Operands, Range, Plan)) {
        // If Instr can be simplified to an existing VPValue, use it.
        if (RecipeOrValue.is<VPValue *>()) {
          auto *VPV = RecipeOrValue.get<VPValue *>();
          Plan->addVPValue(Instr, VPV);
          // If the re-used value is a recipe, register the recipe for the
          // instruction, in case the recipe for Instr needs to be recorded.
          if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
            RecipeBuilder.setRecipe(Instr, R);
          continue;
        }
        // Otherwise, add the new recipe.
        VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
        for (auto *Def : Recipe->definedValues()) {
          auto *UV = Def->getUnderlyingValue();
          Plan->addVPValue(UV, Def);
        }

        if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
            HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
          // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
          // of the header block. That can happen for truncates of induction
          // variables. Those recipes are moved to the phi section of the header
          // block after applying SinkAfter, which relies on the original
          // position of the trunc.
          assert(isa<TruncInst>(Instr));
          InductionsToMove.push_back(
              cast<VPWidenIntOrFpInductionRecipe>(Recipe));
        }
        RecipeBuilder.setRecipe(Instr, Recipe);
        VPBB->appendRecipe(Recipe);
        continue;
      }

      // Otherwise, if all widening options failed, Instruction is to be
      // replicated. This may create a successor for VPBB.
      VPBasicBlock *NextVPBB =
          RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
      if (NextVPBB != VPBB) {
        VPBB = NextVPBB;
        VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
                                    : "");
      }
    }
  }

  // Close the top-level region: VPBB is the last block created, i.e. the latch.
  assert(isa<VPRegionBlock>(Plan->getEntry()) &&
         !Plan->getEntry()->getEntryBasicBlock()->empty() &&
         "entry block must be set to a VPRegionBlock having a non-empty entry "
         "VPBasicBlock");
  cast<VPRegionBlock>(Plan->getEntry())->setExit(VPBB);
  RecipeBuilder.fixHeaderPhis();

  // ---------------------------------------------------------------------------
  // Transform initial VPlan: Apply previously taken decisions, in order, to
  // bring the VPlan to its final state.
  // ---------------------------------------------------------------------------

  // Apply Sink-After legal constraints.
  // Returns the enclosing replicate region of R, or nullptr if R is not the
  // sole recipe of a replicator region's main block.
  auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
    auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
    if (Region && Region->isReplicator()) {
      assert(Region->getNumSuccessors() == 1 &&
             Region->getNumPredecessors() == 1 && "Expected SESE region!");
      assert(R->getParent()->size() == 1 &&
             "A recipe in an original replicator region must be the only "
             "recipe in its block");
      return Region;
    }
    return nullptr;
  };
  for (auto &Entry : SinkAfter) {
    VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
    VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);

    auto *TargetRegion = GetReplicateRegion(Target);
    auto *SinkRegion = GetReplicateRegion(Sink);
    if (!SinkRegion) {
      // If the sink source is not a replicate region, sink the recipe directly.
      if (TargetRegion) {
        // The target is in a replication region, make sure to move Sink to
        // the block after it, not into the replication region itself.
        VPBasicBlock *NextBlock =
            cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
        Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
      } else
        Sink->moveAfter(Target);
      continue;
    }

    // The sink source is in a replicate region. Unhook the region from the CFG.
    auto *SinkPred = SinkRegion->getSinglePredecessor();
    auto *SinkSucc = SinkRegion->getSingleSuccessor();
    VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
    VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
    VPBlockUtils::connectBlocks(SinkPred, SinkSucc);

    if (TargetRegion) {
      // The target recipe is also in a replicate region, move the sink region
      // after the target region.
      auto *TargetSucc = TargetRegion->getSingleSuccessor();
      VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
      VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
      VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
    } else {
      // The sink source is in a replicate region, we need to move the whole
      // replicate region, which should only contain a single recipe in the
      // main block.
      auto *SplitBlock =
          Target->getParent()->splitAt(std::next(Target->getIterator()));

      auto *SplitPred = SplitBlock->getSinglePredecessor();

      VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
      VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
      VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
      // Splitting may have detached the latch; keep VPBB pointing at it.
      if (VPBB == SplitPred)
        VPBB = SplitBlock;
    }
  }

  // Now that sink-after is done, move induction recipes for optimized truncates
  // to the phi section of the header block.
  for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
    Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());

  // Adjust the recipes for any inloop reductions.
  adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start);

  // Introduce a recipe to combine the incoming and previous values of a
  // first-order recurrence.
  for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
    auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
    if (!RecurPhi)
      continue;

    // Place the splice right after the recipe producing the previous value
    // (or after the replicate region / phi section containing it).
    VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
    VPBasicBlock *InsertBlock = PrevRecipe->getParent();
    auto *Region = GetReplicateRegion(PrevRecipe);
    if (Region)
      InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
    if (Region || PrevRecipe->isPhi())
      Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
    else
      Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));

    auto *RecurSplice = cast<VPInstruction>(
        Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
                             {RecurPhi, RecurPhi->getBackedgeValue()}));

    RecurPhi->replaceAllUsesWith(RecurSplice);
    // Set the first operand of RecurSplice to RecurPhi again, after replacing
    // all users.
    RecurSplice->setOperand(0, RecurPhi);
  }

  // Interleave memory: for each Interleave Group we marked earlier as relevant
  // for this VPlan, replace the Recipes widening its memory instructions with a
  // single VPInterleaveRecipe at its insertion point.
  for (auto IG : InterleaveGroups) {
    auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
        RecipeBuilder.getRecipe(IG->getInsertPos()));
    SmallVector<VPValue *, 4> StoredValues;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
        auto *StoreR =
            cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
        StoredValues.push_back(StoreR->getStoredValue());
      }

    auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
                                        Recipe->getMask());
    VPIG->insertBefore(Recipe);
    // Rewire users of each member's old value to the corresponding result of
    // the interleave recipe, then erase the now-dead member recipes.
    unsigned J = 0;
    for (unsigned i = 0; i < IG->getFactor(); ++i)
      if (Instruction *Member = IG->getMember(i)) {
        if (!Member->getType()->isVoidTy()) {
          VPValue *OriginalV = Plan->getVPValue(Member);
          Plan->removeVPValueFor(Member);
          Plan->addVPValue(Member, VPIG->getVPValue(J));
          OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
          J++;
        }
        RecipeBuilder.getRecipe(Member)->eraseFromParent();
      }
  }

  // From this point onwards, VPlan-to-VPlan transformations may change the plan
  // in ways that accessing values using original IR values is incorrect.
  Plan->disableValue2VPValue();

  VPlanTransforms::sinkScalarOperands(*Plan);
  VPlanTransforms::mergeReplicateRegions(*Plan);

  // Register all VFs in Range with the plan and give it a descriptive name.
  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  ElementCount VF = Range.Start;
  Plan->addVF(VF);
  RSO << "Initial VPlan for VF={" << VF;
  for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
    Plan->addVF(VF);
    RSO << "," << VF;
  }
  RSO << "},UF>=1";
  RSO.flush();
  Plan->setName(PlanName);

  assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
  return Plan;
}
9733 
9734 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
9735   // Outer loop handling: They may require CFG and instruction level
9736   // transformations before even evaluating whether vectorization is profitable.
9737   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9738   // the vectorization pipeline.
9739   assert(!OrigLoop->isInnermost());
9740   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9741 
9742   // Create new empty VPlan
9743   auto Plan = std::make_unique<VPlan>();
9744 
9745   // Build hierarchical CFG
9746   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9747   HCFGBuilder.buildHierarchicalCFG();
9748 
9749   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9750        VF *= 2)
9751     Plan->addVF(VF);
9752 
9753   if (EnableVPlanPredication) {
9754     VPlanPredicator VPP(*Plan);
9755     VPP.predicate();
9756 
9757     // Avoid running transformation to recipes until masked code generation in
9758     // VPlan-native path is in place.
9759     return Plan;
9760   }
9761 
9762   SmallPtrSet<Instruction *, 1> DeadInstructions;
9763   VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
9764                                              Legal->getInductionVars(),
9765                                              DeadInstructions, *PSE.getSE());
9766   return Plan;
9767 }
9768 
9769 // Adjust the recipes for reductions. For in-loop reductions the chain of
9770 // instructions leading from the loop exit instr to the phi need to be converted
9771 // to reductions, with one operand being vector and the other being the scalar
9772 // reduction chain. For other reductions, a select is introduced between the phi
9773 // and live-out recipes when folding the tail.
9774 void LoopVectorizationPlanner::adjustRecipesForReductions(
9775     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9776     ElementCount MinVF) {
9777   for (auto &Reduction : CM.getInLoopReductionChains()) {
9778     PHINode *Phi = Reduction.first;
9779     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9780     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9781 
9782     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9783       continue;
9784 
9785     // ReductionOperations are orders top-down from the phi's use to the
9786     // LoopExitValue. We keep a track of the previous item (the Chain) to tell
9787     // which of the two operands will remain scalar and which will be reduced.
9788     // For minmax the chain will be the select instructions.
9789     Instruction *Chain = Phi;
9790     for (Instruction *R : ReductionOperations) {
9791       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9792       RecurKind Kind = RdxDesc.getRecurrenceKind();
9793 
9794       VPValue *ChainOp = Plan->getVPValue(Chain);
9795       unsigned FirstOpId;
9796       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9797              "Only min/max recurrences allowed for inloop reductions");
9798       // Recognize a call to the llvm.fmuladd intrinsic.
9799       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9800       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9801              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9802       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9803         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9804                "Expected to replace a VPWidenSelectSC");
9805         FirstOpId = 1;
9806       } else {
9807         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9808                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9809                "Expected to replace a VPWidenSC");
9810         FirstOpId = 0;
9811       }
9812       unsigned VecOpId =
9813           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9814       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9815 
9816       auto *CondOp = CM.foldTailByMasking()
9817                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9818                          : nullptr;
9819 
9820       if (IsFMulAdd) {
9821         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9822         // need to create an fmul recipe to use as the vector operand for the
9823         // fadd reduction.
9824         VPInstruction *FMulRecipe = new VPInstruction(
9825             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9826         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9827         WidenRecipe->getParent()->insert(FMulRecipe,
9828                                          WidenRecipe->getIterator());
9829         VecOp = FMulRecipe;
9830       }
9831       VPReductionRecipe *RedRecipe =
9832           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9833       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9834       Plan->removeVPValueFor(R);
9835       Plan->addVPValue(R, RedRecipe);
9836       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9837       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9838       WidenRecipe->eraseFromParent();
9839 
9840       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9841         VPRecipeBase *CompareRecipe =
9842             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9843         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9844                "Expected to replace a VPWidenSC");
9845         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9846                "Expected no remaining users");
9847         CompareRecipe->eraseFromParent();
9848       }
9849       Chain = R;
9850     }
9851   }
9852 
9853   // If tail is folded by masking, introduce selects between the phi
9854   // and the live-out instruction of each reduction, at the end of the latch.
9855   if (CM.foldTailByMasking()) {
9856     for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9857       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9858       if (!PhiR || PhiR->isInLoop())
9859         continue;
9860       Builder.setInsertPoint(LatchVPBB);
9861       VPValue *Cond =
9862           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9863       VPValue *Red = PhiR->getBackedgeValue();
9864       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9865     }
9866   }
9867 }
9868 
9869 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9870 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9871                                VPSlotTracker &SlotTracker) const {
9872   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9873   IG->getInsertPos()->printAsOperand(O, false);
9874   O << ", ";
9875   getAddr()->printAsOperand(O, SlotTracker);
9876   VPValue *Mask = getMask();
9877   if (Mask) {
9878     O << ", ";
9879     Mask->printAsOperand(O, SlotTracker);
9880   }
9881 
9882   unsigned OpIdx = 0;
9883   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9884     if (!IG->getMember(i))
9885       continue;
9886     if (getNumStoreOperands() > 0) {
9887       O << "\n" << Indent << "  store ";
9888       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9889       O << " to index " << i;
9890     } else {
9891       O << "\n" << Indent << "  ";
9892       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9893       O << " = load from index " << i;
9894     }
9895     ++OpIdx;
9896   }
9897 }
9898 #endif
9899 
// Widen the underlying call; `this` serves both as the recipe's defined
// VPValue and as the VPUser carrying the call's operands.
void VPWidenCallRecipe::execute(VPTransformState &State) {
  State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
                                  *this, State);
}
9904 
// Widen the underlying select, forwarding whether its condition was found to
// be loop-invariant (InvariantCond) during recipe construction.
void VPWidenSelectRecipe::execute(VPTransformState &State) {
  State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
                                    this, *this, InvariantCond, State);
}
9909 
// Generic widening: delegate to ILV to vectorize the underlying instruction.
void VPWidenRecipe::execute(VPTransformState &State) {
  State.ILV->widenInstruction(*getUnderlyingInstr(), this, State);
}
9913 
// Widen the underlying GEP across all UF parts and VF lanes, forwarding the
// pointer-/index-invariance flags computed when the recipe was built.
void VPWidenGEPRecipe::execute(VPTransformState &State) {
  State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
                      *this, State.UF, State.VF, IsPtrLoopInvariant,
                      IsIndexLoopInvariant, State);
}
9919 
// Widen an integer/FP induction: the start value is the live-in IR value of
// the first operand; an optional trunc and cast value are forwarded as well.
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
  // Inductions are generated once per part, never per lane.
  assert(!State.Instance && "Int or FP induction being replicated.");
  State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
                                   getTruncInst(), getVPValue(0),
                                   getCastValue(), State);
}
9926 
// Widen the underlying phi node by delegating to ILV.
void VPWidenPHIRecipe::execute(VPTransformState &State) {
  State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
                                 State);
}
9931 
9932 void VPBlendRecipe::execute(VPTransformState &State) {
9933   State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9934   // We know that all PHIs in non-header blocks are converted into
9935   // selects, so we don't have to worry about the insertion order and we
9936   // can just use the builder.
9937   // At this point we generate the predication tree. There may be
9938   // duplications since this is a simple recursive scan, but future
9939   // optimizations will clean it up.
9940 
9941   unsigned NumIncoming = getNumIncomingValues();
9942 
9943   // Generate a sequence of selects of the form:
9944   // SELECT(Mask3, In3,
9945   //        SELECT(Mask2, In2,
9946   //               SELECT(Mask1, In1,
9947   //                      In0)))
9948   // Note that Mask0 is never used: lanes for which no path reaches this phi and
9949   // are essentially undef are taken from In0.
9950   InnerLoopVectorizer::VectorParts Entry(State.UF);
9951   for (unsigned In = 0; In < NumIncoming; ++In) {
9952     for (unsigned Part = 0; Part < State.UF; ++Part) {
9953       // We might have single edge PHIs (blocks) - use an identity
9954       // 'select' for the first PHI operand.
9955       Value *In0 = State.get(getIncomingValue(In), Part);
9956       if (In == 0)
9957         Entry[Part] = In0; // Initialize with the first incoming value.
9958       else {
9959         // Select between the current value and the previous incoming edge
9960         // based on the incoming mask.
9961         Value *Cond = State.get(getMask(In), Part);
9962         Entry[Part] =
9963             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9964       }
9965     }
9966   }
9967   for (unsigned Part = 0; Part < State.UF; ++Part)
9968     State.set(this, Entry[Part], Part);
9969 }
9970 
// Vectorize the whole interleave group at once, passing its address, stored
// values and optional mask to ILV.
void VPInterleaveRecipe::execute(VPTransformState &State) {
  // Interleave groups are generated once per part, never per lane.
  assert(!State.Instance && "Interleave group being replicated.");
  State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
                                      getStoredValues(), getMask());
}
9976 
// Generate the in-loop reduction: for each unrolled part, reduce the vector
// operand and combine the result with the scalar chain value, honoring
// ordered (strict FP) reductions and an optional mask (tail folding).
void VPReductionRecipe::execute(VPTransformState &State) {
  assert(!State.Instance && "Reduction being replicated.");
  // For ordered reductions the chain value of part 0 is threaded through all
  // parts; for unordered ones each part re-reads its own chain operand below.
  Value *PrevInChain = State.get(getChainOp(), 0);
  RecurKind Kind = RdxDesc->getRecurrenceKind();
  bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
  // Propagate the fast-math flags carried by the underlying instruction.
  IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
  State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
  for (unsigned Part = 0; Part < State.UF; ++Part) {
    Value *NewVecOp = State.get(getVecOp(), Part);
    if (VPValue *Cond = getCondOp()) {
      // Masked reduction: replace inactive lanes with the recurrence identity
      // so they do not affect the reduced value.
      Value *NewCond = State.get(Cond, Part);
      VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
      Value *Iden = RdxDesc->getRecurrenceIdentity(
          Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
      Value *IdenVec =
          State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
      Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
      NewVecOp = Select;
    }
    Value *NewRed;
    Value *NextInChain;
    if (IsOrdered) {
      // Ordered: fold the whole vector into the running scalar chain. For
      // scalar VF a plain binary op suffices.
      if (State.VF.isVector())
        NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
                                        PrevInChain);
      else
        NewRed = State.Builder.CreateBinOp(
            (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
            NewVecOp);
      PrevInChain = NewRed;
    } else {
      // Unordered: reduce this part's vector, then combine with the chain.
      PrevInChain = State.get(getChainOp(), Part);
      NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
    }
    if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
      // Min/max chains combine via a compare+select rather than a binary op.
      NextInChain =
          createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
                         NewRed, PrevInChain);
    } else if (IsOrdered)
      NextInChain = NewRed;
    else
      NextInChain = State.Builder.CreateBinOp(
          (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
          PrevInChain);
    State.set(this, NextInChain, Part);
  }
}
10025 
// Emit scalar copies of the underlying instruction — either a single
// requested instance (when predicated/replicated per lane) or all lanes of
// all unrolled parts.
void VPReplicateRecipe::execute(VPTransformState &State) {
  if (State.Instance) { // Generate a single instance.
    assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
    State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
                                    IsPredicated, State);
    // Insert scalar instance packing it into a vector.
    if (AlsoPack && State.VF.isVector()) {
      // If we're constructing lane 0, initialize to start from poison.
      if (State.Instance->Lane.isFirstLane()) {
        assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
        Value *Poison = PoisonValue::get(
            VectorType::get(getUnderlyingValue()->getType(), State.VF));
        State.set(this, Poison, State.Instance->Part);
      }
      State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
    }
    return;
  }

  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform in which case generate only the first lane for each
  // of the UF parts.
  unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
  assert((!State.VF.isScalable() || IsUniform) &&
         "Can't scalarize a scalable vector");
  for (unsigned Part = 0; Part < State.UF; ++Part)
    for (unsigned Lane = 0; Lane < EndLane; ++Lane)
      State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
                                      VPIteration(Part, Lane), IsPredicated,
                                      State);
}
10057 
10058 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
10059   assert(State.Instance && "Branch on Mask works only on single instance.");
10060 
10061   unsigned Part = State.Instance->Part;
10062   unsigned Lane = State.Instance->Lane.getKnownLane();
10063 
10064   Value *ConditionBit = nullptr;
10065   VPValue *BlockInMask = getMask();
10066   if (BlockInMask) {
10067     ConditionBit = State.get(BlockInMask, Part);
10068     if (ConditionBit->getType()->isVectorTy())
10069       ConditionBit = State.Builder.CreateExtractElement(
10070           ConditionBit, State.Builder.getInt32(Lane));
10071   } else // Block in mask is all-one.
10072     ConditionBit = State.Builder.getTrue();
10073 
10074   // Replace the temporary unreachable terminator with a new conditional branch,
10075   // whose two destinations will be set later when they are created.
10076   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
10077   assert(isa<UnreachableInst>(CurrentTerminator) &&
10078          "Expected to replace unreachable terminator with conditional branch.");
10079   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
10080   CondBr->setSuccessor(0, nullptr);
10081   ReplaceInstWithInst(CurrentTerminator, CondBr);
10082 }
10083 
// Create the phi node that merges the predicated instruction's value from the
// predicated block with the incoming (unmodified) value from the
// predicating block — either a vector phi (when the value is being packed
// into a vector) or a scalar phi for the single instance.
void VPPredInstPHIRecipe::execute(VPTransformState &State) {
  assert(State.Instance && "Predicated instruction PHI works per instance.");
  Instruction *ScalarPredInst =
      cast<Instruction>(State.get(getOperand(0), *State.Instance));
  BasicBlock *PredicatedBB = ScalarPredInst->getParent();
  BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
  assert(PredicatingBB && "Predicated block has no single predecessor.");
  assert(isa<VPReplicateRecipe>(getOperand(0)) &&
         "operand must be VPReplicateRecipe");

  // By current pack/unpack logic we need to generate only a single phi node: if
  // a vector value for the predicated instruction exists at this point it means
  // the instruction has vector users only, and a phi for the vector value is
  // needed. In this case the recipe of the predicated instruction is marked to
  // also do that packing, thereby "hoisting" the insert-element sequence.
  // Otherwise, a phi node for the scalar value is needed.
  unsigned Part = State.Instance->Part;
  if (State.hasVectorValue(getOperand(0), Part)) {
    // Vector case: phi between the pre-insert vector and the vector with the
    // newly inserted element.
    Value *VectorValue = State.get(getOperand(0), Part);
    InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
    PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
    VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
    VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
    if (State.hasVectorValue(this, Part))
      State.reset(this, VPhi, Part);
    else
      State.set(this, VPhi, Part);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), VPhi, Part);
  } else {
    // Scalar case: phi between poison (lane not executed) and the generated
    // scalar value.
    Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
    PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
    Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
                     PredicatingBB);
    Phi->addIncoming(ScalarPredInst, PredicatedBB);
    if (State.hasScalarValue(this, *State.Instance))
      State.reset(this, Phi, *State.Instance);
    else
      State.set(this, Phi, *State.Instance);
    // NOTE: Currently we need to update the value of the operand, so the next
    // predicated iteration inserts its generated value in the correct vector.
    State.reset(getOperand(0), Phi, *State.Instance);
  }
}
10129 
10130 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
10131   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
10132   State.ILV->vectorizeMemoryInstruction(
10133       &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(),
10134       StoredValue, getMask(), Consecutive, Reverse);
10135 }
10136 
// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication. The checks are applied strictly in that priority order;
// the first one that matches decides the lowering.
static ScalarEpilogueLowering getScalarEpilogueLowering(
    Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
    BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
    LoopVectorizationLegality &LVL) {
  // 1) OptSize takes precedence over all other options, i.e. if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
  // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
  // LoopAccessInfo (due to code dependency and not being able to reliably get
  // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
  // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
  // versioning when the vectorization is forced, unlike hasOptSize. So revert
  // back to the old way and vectorize with versioning when forced. See D81345.)
  if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
                                                      PGSOQueryType::IRPass) &&
                          Hints.getForce() != LoopVectorizeHints::FK_Enabled))
    return CM_ScalarEpilogueNotAllowedOptSize;

  // 2) If set, obey the directives
  // (Only consulted when the option was explicitly given on the command line;
  // unhandled enum values fall through to the next check.)
  if (PreferPredicateOverEpilogue.getNumOccurrences()) {
    switch (PreferPredicateOverEpilogue) {
    case PreferPredicateTy::ScalarEpilogue:
      return CM_ScalarEpilogueAllowed;
    case PreferPredicateTy::PredicateElseScalarEpilogue:
      return CM_ScalarEpilogueNotNeededUsePredicate;
    case PreferPredicateTy::PredicateOrDontVectorize:
      return CM_ScalarEpilogueNotAllowedUsePredicate;
    };
  }

  // 3) If set, obey the hints
  // (Other hint values, e.g. undefined, fall through to the TTI query below.)
  switch (Hints.getPredicate()) {
  case LoopVectorizeHints::FK_Enabled:
    return CM_ScalarEpilogueNotNeededUsePredicate;
  case LoopVectorizeHints::FK_Disabled:
    return CM_ScalarEpilogueAllowed;
  };

  // 4) if the TTI hook indicates this is profitable, request predication.
  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
                                       LVL.getLAI()))
    return CM_ScalarEpilogueNotNeededUsePredicate;

  return CM_ScalarEpilogueAllowed;
}
10186 
// Return the vector Value generated for \p Def in unroll part \p Part,
// materializing it on demand from scalar per-lane values (by broadcast or by
// an insertelement packing sequence) if no vector value has been set yet.
Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If Values have been set for this Def return the one relevant for \p Part.
  if (hasVectorValue(Def, Part))
    return Data.PerPartOutput[Def][Part];

  // No scalar value for lane 0 either: Def must be a live-in IR value, which
  // is broadcast to a vector and cached.
  if (!hasScalarValue(Def, {Part, 0})) {
    Value *IRV = Def->getLiveInIRValue();
    Value *B = ILV->getBroadcastInstrs(IRV);
    set(Def, B, Part);
    return B;
  }

  Value *ScalarValue = get(Def, {Part, 0});
  // If we aren't vectorizing, we can just copy the scalar map values over
  // to the vector map.
  if (VF.isScalar()) {
    set(Def, ScalarValue, Part);
    return ScalarValue;
  }

  auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
  bool IsUniform = RepR && RepR->isUniform();

  // Uniform values only need lane 0; otherwise packing reads the last lane to
  // place the insertion point after all scalar definitions.
  unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
  // Check if there is a scalar value for the selected lane.
  if (!hasScalarValue(Def, {Part, LastLane})) {
    // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
    assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
           "unexpected recipe found to be invariant");
    IsUniform = true;
    LastLane = 0;
  }

  auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
  // Set the insert point after the last scalarized instruction or after the
  // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
  // will directly follow the scalar definitions.
  auto OldIP = Builder.saveIP();
  auto NewIP =
      isa<PHINode>(LastInst)
          ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
          : std::next(BasicBlock::iterator(LastInst));
  Builder.SetInsertPoint(&*NewIP);

  // However, if we are vectorizing, we need to construct the vector values.
  // If the value is known to be uniform after vectorization, we can just
  // broadcast the scalar value corresponding to lane zero for each unroll
  // iteration. Otherwise, we construct the vector values using
  // insertelement instructions. Since the resulting vectors are stored in
  // State, we will only generate the insertelements once.
  Value *VectorValue = nullptr;
  if (IsUniform) {
    VectorValue = ILV->getBroadcastInstrs(ScalarValue);
    set(Def, VectorValue, Part);
  } else {
    // Initialize packing with insertelements to start from undef.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Undef, Part);
    for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
      ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
    VectorValue = get(Def, Part);
  }
  // Restore the original insert point so callers are unaffected.
  Builder.restoreIP(OldIP);
  return VectorValue;
}
10253 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
// Returns true if the loop was vectorized (and marked as such), false if
// vectorization was not attempted or not possible.
static bool processLoopInVPlanNativePath(
    Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
    LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
    TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
    ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
    LoopVectorizationRequirements &Requirements) {

  // Outer-loop vectorization requires a computable trip count.
  if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
    LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
    return false;
  }
  assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
  Function *F = L->getHeader()->getParent();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());

  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);

  LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                &Hints, IAI);
  // Use the planner for outer loop vectorization.
  // TODO: CM is not used at this point inside the planner. Turn CM into an
  // optional argument if we don't need it in the future.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor.
  ElementCount UserVF = Hints.getWidth();

  CM.collectElementTypesForWidening();

  // Plan how to best vectorize, return the best VF and its cost.
  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
  // Also, do not attempt to vectorize if no vector code will be produced.
  if (VPlanBuildStressTest || EnableVPlanPredication ||
      VectorizationFactor::Disabled() == VF)
    return false;

  VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);

  // Scope the vectorizer and its runtime checks so cleanup happens right
  // after code generation.
  {
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    // Outer-loop path always uses an interleave count of 1.
    InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
                           &CM, BFI, PSI, Checks);
    LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
                      << L->getHeader()->getParent()->getName() << "\"\n");
    LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
  }

  // Mark the loop as already vectorized to avoid vectorizing again.
  Hints.setAlreadyVectorized();
  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}
10317 
10318 // Emit a remark if there are stores to floats that required a floating point
10319 // extension. If the vectorized loop was generated with floating point there
10320 // will be a performance penalty from the conversion overhead and the change in
10321 // the vector width.
10322 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10323   SmallVector<Instruction *, 4> Worklist;
10324   for (BasicBlock *BB : L->getBlocks()) {
10325     for (Instruction &Inst : *BB) {
10326       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10327         if (S->getValueOperand()->getType()->isFloatTy())
10328           Worklist.push_back(S);
10329       }
10330     }
10331   }
10332 
10333   // Traverse the floating point stores upwards searching, for floating point
10334   // conversions.
10335   SmallPtrSet<const Instruction *, 4> Visited;
10336   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10337   while (!Worklist.empty()) {
10338     auto *I = Worklist.pop_back_val();
10339     if (!L->contains(I))
10340       continue;
10341     if (!Visited.insert(I).second)
10342       continue;
10343 
10344     // Emit a remark if the floating point store required a floating
10345     // point conversion.
10346     // TODO: More work could be done to identify the root cause such as a
10347     // constant or a function return type and point the user to it.
10348     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10349       ORE->emit([&]() {
10350         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10351                                           I->getDebugLoc(), L->getHeader())
10352                << "floating point conversion changes vector width. "
10353                << "Mixed floating point precision requires an up/down "
10354                << "cast that will negatively impact performance.";
10355       });
10356 
10357     for (Use &Op : I->operands())
10358       if (auto *OpI = dyn_cast<Instruction>(Op))
10359         Worklist.push_back(OpI);
10360   }
10361 }
10362 
// Construct the pass. Either an explicit pass option or a disabled global
// flag (EnableLoopInterleaving / EnableLoopVectorization) restricts the
// respective transform to loops where it is explicitly forced.
LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
                               !EnableLoopInterleaving),
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
                              !EnableLoopVectorization) {}
10368 
// Main per-loop driver: check hints and legality, run the cost model and
// planner, decide between vectorizing, interleaving, or both, generate the
// code, and emit remarks. Returns true if the loop was transformed.
bool LoopVectorizePass::processLoop(Loop *L) {
  assert((EnableVPlanNativePath || L->isInnermost()) &&
         "VPlan-native path is not enabled. Only process inner loops.");

#ifndef NDEBUG
  const std::string DebugLocStr = getDebugLocString(L);
#endif /* NDEBUG */

  LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
                    << L->getHeader()->getParent()->getName() << "\" from "
                    << DebugLocStr << "\n");

  LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);

  LLVM_DEBUG(
      dbgs() << "LV: Loop hints:"
             << " force="
             << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
                     ? "disabled"
                     : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
                            ? "enabled"
                            : "?"))
             << " width=" << Hints.getWidth()
             << " interleave=" << Hints.getInterleave() << "\n");

  // Function containing loop
  Function *F = L->getHeader()->getParent();

  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose reporting vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.

  if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
    LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
    return false;
  }

  PredicatedScalarEvolution PSE(*SE, *L);

  // Check if it is legal to vectorize the loop.
  LoopVectorizationRequirements Requirements;
  LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
                                &Requirements, &Hints, DB, AC, BFI, PSI);
  if (!LVL.canVectorize(EnableVPlanNativePath)) {
    LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check the function attributes and profiles to find out if this function
  // should be optimized for size.
  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);

  // Entrance to the VPlan-native vectorization path. Outer loops are processed
  // here. They may require CFG and instruction level transformations before
  // even evaluating whether vectorization is profitable. Since we cannot modify
  // the incoming IR, we need to build VPlan upfront in the vectorization
  // pipeline.
  if (!L->isInnermost())
    return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
                                        ORE, BFI, PSI, Hints, Requirements);

  assert(L->isInnermost() && "Inner loop expected.");

  // Check the loop for a trip count threshold: vectorize loops with a tiny trip
  // count by optimizing for size, to minimize overheads.
  auto ExpectedTC = getSmallBestKnownTC(*SE, L);
  if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
    LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                      << "This loop is worth vectorizing only if no scalar "
                      << "iteration overheads are incurred.");
    if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
      LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
    else {
      LLVM_DEBUG(dbgs() << "\n");
      // Forbid a scalar epilogue so the tiny-trip-count loop isn't penalized.
      SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
    }
  }

  // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem possibly correct -- what if the loop is
  // an integer loop and the vector instructions selected are purely integer
  // vector instructions?
  if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    reportVectorizationFailure(
        "Can't vectorize when the NoImplicitFloat attribute is used",
        "loop not vectorized due to NoImplicitFloat attribute",
        "NoImplicitFloat", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  // Check if the target supports potentially unsafe FP vectorization.
  // FIXME: Add a check for the type of safety issue (denormal, signaling)
  // for the target we're vectorizing for, to make sure none of the
  // additional fp-math flags can help.
  if (Hints.isPotentiallyUnsafe() &&
      TTI->isFPVectorizationPotentiallyUnsafe()) {
    reportVectorizationFailure(
        "Potentially unsafe FP op prevents vectorization",
        "loop not vectorized due to unsafe FP support.",
        "UnsafeFP", ORE, L);
    Hints.emitRemarkWithHints();
    return false;
  }

  bool AllowOrderedReductions;
  // If the flag is set, use that instead and override the TTI behaviour.
  if (ForceOrderedReductions.getNumOccurrences() > 0)
    AllowOrderedReductions = ForceOrderedReductions;
  else
    AllowOrderedReductions = TTI->enableOrderedReductions();
  if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
    ORE->emit([&]() {
      auto *ExactFPMathInst = Requirements.getExactFPInst();
      return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
                                                 ExactFPMathInst->getDebugLoc(),
                                                 ExactFPMathInst->getParent())
             << "loop not vectorized: cannot prove it is safe to reorder "
                "floating-point operations";
    });
    LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                         "reorder floating-point operations\n");
    Hints.emitRemarkWithHints();
    return false;
  }

  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());

  // If an override option has been passed in for interleaved accesses, use it.
  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
    UseInterleaved = EnableInterleavedMemAccesses;

  // Analyze interleaved memory accesses.
  if (UseInterleaved) {
    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
  }

  // Use the cost model.
  LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                F, &Hints, IAI);
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  // Use the planner for vectorization.
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
                               Requirements, ORE);

  // Get user vectorization factor and interleave count.
  ElementCount UserVF = Hints.getWidth();
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;

  if (MaybeVF) {
    VF = *MaybeVF;
    // Select the interleave count.
    IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
  }

  // Identify the diagnostic messages that should be produced.
  std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
  bool VectorizeLoop = true, InterleaveLoop = true;
  if (VF.Width.isScalar()) {
    LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
    VecDiagMsg = std::make_pair(
        "VectorizationNotBeneficial",
        "the cost-model indicates that vectorization is not beneficial");
    VectorizeLoop = false;
  }

  if (!MaybeVF && UserIC > 1) {
    // Tell the user interleaving was avoided up-front, despite being explicitly
    // requested.
    LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                         "interleaving should be avoided up front\n");
    IntDiagMsg = std::make_pair(
        "InterleavingAvoided",
        "Ignoring UserIC, because interleaving was avoided up front");
    InterleaveLoop = false;
  } else if (IC == 1 && UserIC <= 1) {
    // Tell the user interleaving is not beneficial.
    LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
    IntDiagMsg = std::make_pair(
        "InterleavingNotBeneficial",
        "the cost-model indicates that interleaving is not beneficial");
    InterleaveLoop = false;
    if (UserIC == 1) {
      IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
      IntDiagMsg.second +=
          " and is explicitly disabled or interleave count is set to 1";
    }
  } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
    LLVM_DEBUG(
        dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
    IntDiagMsg = std::make_pair(
        "InterleavingBeneficialButDisabled",
        "the cost-model indicates that interleaving is beneficial "
        "but is explicitly disabled or interleave count is set to 1");
    InterleaveLoop = false;
  }

  // Override IC if user provided an interleave count.
  IC = UserIC > 0 ? UserIC : IC;

  // Emit diagnostic messages, if any.
  const char *VAPassName = Hints.vectorizeAnalysisPassName();
  if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
    ORE->emit([&]() {
      return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
    ORE->emit([&]() {
      return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                      L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
    return false;
  } else if (!VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << VecDiagMsg.second;
    });
  } else if (VectorizeLoop && !InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    ORE->emit([&]() {
      return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
                                        L->getStartLoc(), L->getHeader())
             << IntDiagMsg.second;
    });
  } else if (VectorizeLoop && InterleaveLoop) {
    LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
                      << ") in " << DebugLocStr << '\n');
    LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
  }

  bool DisableRuntimeUnroll = false;
  MDNode *OrigLoopID = L->getLoopID();
  {
    // Optimistically generate runtime checks. Drop them if they turn out to not
    // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector codegeneration is done.
    GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
                             F->getParent()->getDataLayout());
    if (!VF.Width.isScalar() || IC > 1)
      Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());

    using namespace ore;
    if (!VectorizeLoop) {
      assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not legal to vectorize the loop, then
      // interleave it.
      InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
                                 &CM, BFI, PSI, Checks);

      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
      LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);

      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
                                  L->getHeader())
               << "interleaved loop (interleaved count: "
               << NV("InterleaveCount", IC) << ")";
      });
    } else {
      // If we decided that it is *legal* to vectorize the loop, then do it.

      // Consider vectorizing the epilogue too if it's profitable.
      VectorizationFactor EpilogueVF =
          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
      if (EpilogueVF.Width.isVector()) {

        // The first pass vectorizes the main loop and creates a scalar epilogue
        // to be vectorized by executing the plan (potentially with a different
        // factor) again shortly afterwards.
        EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
                                           EPI, &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
        LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
                        DT);
        ++LoopsVectorized;

        // Re-establish loop-simplify/LCSSA form before vectorizing the
        // remainder (epilogue) loop.
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
        formLCSSARecursively(*L, *DT, LI, SE);

        // Second pass vectorizes the epilogue and adjusts the control flow
        // edges from the first pass.
        EPI.MainLoopVF = EPI.EpilogueVF;
        EPI.MainLoopUF = EPI.EpilogueUF;
        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
                                                 Checks);

        VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
        LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
                        DT);
        ++LoopsEpilogueVectorized;

        if (!MainILV.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      } else {
        InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
                               &LVL, &CM, BFI, PSI, Checks);

        VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
        LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
        ++LoopsVectorized;

        // Add metadata to disable runtime unrolling a scalar loop when there
        // are no runtime checks about strides and memory. A scalar loop that is
        // rarely used is not worth unrolling.
        if (!LB.areSafetyChecksAdded())
          DisableRuntimeUnroll = true;
      }
      // Report the vectorization decision.
      ORE->emit([&]() {
        return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
                                  L->getHeader())
               << "vectorized loop (vectorization width: "
               << NV("VectorizationFactor", VF.Width)
               << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
      });
    }

    if (ORE->allowExtraAnalysis(LV_NAME))
      checkMixedPrecision(L, ORE);
  }

  // Propagate any follow-up loop metadata to the remainder loop; otherwise
  // mark the loop as done (and optionally disable runtime unrolling).
  Optional<MDNode *> RemainderLoopID =
      makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
                                      LLVMLoopVectorizeFollowupEpilogue});
  if (RemainderLoopID.hasValue()) {
    L->setLoopID(RemainderLoopID.getValue());
  } else {
    if (DisableRuntimeUnroll)
      AddRuntimeUnrollDisableMetaData(L);

    // Mark the loop as already vectorized to avoid vectorizing again.
    Hints.setAlreadyVectorized();
  }

  assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
  return true;
}
10731 
// Shared implementation for the pass entry points: wire up the analyses,
// simplify all loops, then process each supported inner loop. The result
// records whether anything changed and whether the CFG changed.
LoopVectorizeResult LoopVectorizePass::runImpl(
    Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
    DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
    std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
  // Stash the analysis results in members for use by processLoop.
  SE = &SE_;
  LI = &LI_;
  TTI = &TTI_;
  DT = &DT_;
  BFI = &BFI_;
  TLI = TLI_;
  AA = &AA_;
  AC = &AC_;
  GetLAA = &GetLAA_;
  DB = &DB_;
  ORE = &ORE_;
  PSI = PSI_;

  // Don't attempt if
  // 1. the target claims to have no vector registers, and
  // 2. interleaving won't help ILP.
  //
  // The second condition is necessary because, even if the target has no
  // vector registers, loop vectorization may still enable scalar
  // interleaving.
  if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
      TTI->getMaxInterleaveFactor(1) < 2)
    return LoopVectorizeResult(false, false);

  bool Changed = false, CFGChanged = false;

  // The vectorizer requires loops to be in simplified form.
  // Since simplification may add new inner loops, it has to run before the
  // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything end up being
  // vectorized.
  for (auto &L : *LI)
    Changed |= CFGChanged |=
        simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);

  // Build up a worklist of inner-loops to vectorize. This is necessary as
  // the act of vectorizing or partially unrolling a loop creates new loops
  // and can invalidate iterators across the loops.
  SmallVector<Loop *, 8> Worklist;

  for (Loop *L : *LI)
    collectSupportedLoops(*L, LI, ORE, Worklist);

  LoopsAnalyzed += Worklist.size();

  // Now walk the identified inner loops.
  while (!Worklist.empty()) {
    Loop *L = Worklist.pop_back_val();

    // For the inner loops we actually process, form LCSSA to simplify the
    // transform.
    Changed |= formLCSSARecursively(*L, *DT, LI, SE);

    Changed |= CFGChanged |= processLoop(L);
  }

  // Process each loop nest in the function.
  return LoopVectorizeResult(Changed, CFGChanged);
}
10797 
10798 PreservedAnalyses LoopVectorizePass::run(Function &F,
10799                                          FunctionAnalysisManager &AM) {
10800     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10801     auto &LI = AM.getResult<LoopAnalysis>(F);
10802     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10803     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10804     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10805     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10806     auto &AA = AM.getResult<AAManager>(F);
10807     auto &AC = AM.getResult<AssumptionAnalysis>(F);
10808     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10809     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10810 
10811     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10812     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10813         [&](Loop &L) -> const LoopAccessInfo & {
10814       LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
10815                                         TLI, TTI, nullptr, nullptr, nullptr};
10816       return LAM.getResult<LoopAccessAnalysis>(L, AR);
10817     };
10818     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10819     ProfileSummaryInfo *PSI =
10820         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10821     LoopVectorizeResult Result =
10822         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10823     if (!Result.MadeAnyChange)
10824       return PreservedAnalyses::all();
10825     PreservedAnalyses PA;
10826 
10827     // We currently do not preserve loopinfo/dominator analyses with outer loop
10828     // vectorization. Until this is addressed, mark these analyses as preserved
10829     // only for non-VPlan-native path.
10830     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10831     if (!EnableVPlanNativePath) {
10832       PA.preserve<LoopAnalysis>();
10833       PA.preserve<DominatorTreeAnalysis>();
10834     }
10835     if (!Result.MadeCFGChange)
10836       PA.preserveSet<CFGAnalyses>();
10837     return PA;
10838 }
10839 
10840 void LoopVectorizePass::printPipeline(
10841     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10842   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10843       OS, MapClassName2PassName);
10844 
10845   OS << "<";
10846   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10847   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10848   OS << ">";
10849 }
10850