1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
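//
// As an illustration (conceptual only, assuming a vectorization factor of 4),
// a scalar loop such as
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + 42;
// is rewritten so that each wide iteration computes
//   A[i:i+3] = B[i:i+3] + <42, 42, 42, 42>;
// and the index advances by 4 instead of 1.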
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/LLVMContext.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/InstructionCost.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142 #include "llvm/Transforms/Utils/SizeOpts.h"
143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144 #include <algorithm>
145 #include <cassert>
146 #include <cstdint>
147 #include <cstdlib>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
202     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks with a "
204              "vectorize(enable) pragma."));
205 
206 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
207 // and that predication is preferred instead. I.e., the
208 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
209 // and predicate the instructions accordingly. If tail-folding fails, there are
210 // different fallback strategies depending on these values:
211 namespace PreferPredicateTy {
212   enum Option {
213     ScalarEpilogue = 0,
214     PredicateElseScalarEpilogue,
215     PredicateOrDontVectorize
216   };
217 } // namespace PreferPredicateTy
218 
219 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
220     "prefer-predicate-over-epilogue",
221     cl::init(PreferPredicateTy::ScalarEpilogue),
222     cl::Hidden,
223     cl::desc("Tail-folding and predication preferences over creating a scalar "
224              "epilogue loop."),
225     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
226                          "scalar-epilogue",
227                          "Don't tail-predicate loops, create scalar epilogue"),
228               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
229                          "predicate-else-scalar-epilogue",
230                          "prefer tail-folding, create scalar epilogue if tail "
231                          "folding fails."),
232               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
233                          "predicate-dont-vectorize",
234                          "prefer tail-folding, don't attempt vectorization if "
235                          "tail-folding fails.")));
236 
237 static cl::opt<bool> MaximizeBandwidth(
238     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
239     cl::desc("Maximize bandwidth when selecting the vectorization factor, which "
240              "will be determined by the smallest type in the loop."));
241 
242 static cl::opt<bool> EnableInterleavedMemAccesses(
243     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
245 
246 /// An interleave-group may need masking if it resides in a block that needs
247 /// predication, or in order to mask away gaps.
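/// As an illustration (hypothetical access pattern): a group with interleave
/// factor 3 whose members access only A[3*i] and A[3*i + 1] has a gap at
/// A[3*i + 2]; a mask lets the wide load cover all three members without
/// touching the unused gap lanes.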
248 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
249     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
250     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
251 
252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
253     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
254     cl::desc("We don't interleave loops with an estimated constant trip count "
255              "below this number"));
256 
257 static cl::opt<unsigned> ForceTargetNumScalarRegs(
258     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
259     cl::desc("A flag that overrides the target's number of scalar registers."));
260 
261 static cl::opt<unsigned> ForceTargetNumVectorRegs(
262     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
263     cl::desc("A flag that overrides the target's number of vector registers."));
264 
265 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
266     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
267     cl::desc("A flag that overrides the target's max interleave factor for "
268              "scalar loops."));
269 
270 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
271     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
272     cl::desc("A flag that overrides the target's max interleave factor for "
273              "vectorized loops."));
274 
275 static cl::opt<unsigned> ForceTargetInstructionCost(
276     "force-target-instruction-cost", cl::init(0), cl::Hidden,
277     cl::desc("A flag that overrides the target's expected cost for "
278              "an instruction to a single constant value. Mostly "
279              "useful for getting consistent testing."));
280 
281 static cl::opt<bool> ForceTargetSupportsScalableVectors(
282     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
283     cl::desc(
284         "Pretend that scalable vectors are supported, even if the target does "
285         "not support them. This flag should only be used for testing."));
286 
287 static cl::opt<unsigned> SmallLoopCost(
288     "small-loop-cost", cl::init(20), cl::Hidden,
289     cl::desc(
290         "The cost of a loop that is considered 'small' by the interleaver."));
291 
292 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
293     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
294     cl::desc("Enable the use of the block frequency analysis to access PGO "
295              "heuristics minimizing code growth in cold regions and being more "
296              "aggressive in hot regions."));
297 
298 // Runtime interleave loops for load/store throughput.
299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
300     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
301     cl::desc(
302         "Enable runtime interleaving until load/store ports are saturated"));
303 
304 /// Interleave small loops with scalar reductions.
305 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
306     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
307     cl::desc("Enable interleaving for loops with small iteration counts that "
308              "contain scalar reductions to expose ILP."));
309 
310 /// The number of stores in a loop that are allowed to need predication.
311 static cl::opt<unsigned> NumberOfStoresToPredicate(
312     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
313     cl::desc("Max number of stores to be predicated behind an if."));
314 
315 static cl::opt<bool> EnableIndVarRegisterHeur(
316     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
317     cl::desc("Count the induction variable only once when interleaving"));
318 
319 static cl::opt<bool> EnableCondStoresVectorization(
320     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
321     cl::desc("Enable if-predication of stores during vectorization."));
322 
323 static cl::opt<unsigned> MaxNestedScalarReductionIC(
324     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
325     cl::desc("The maximum interleave count to use when interleaving a scalar "
326              "reduction in a nested loop."));
327 
328 static cl::opt<bool>
329     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
330                            cl::Hidden,
331                            cl::desc("Prefer in-loop vector reductions, "
332                                     "overriding the target's preference."));
333 
334 static cl::opt<bool> ForceOrderedReductions(
335     "force-ordered-reductions", cl::init(false), cl::Hidden,
336     cl::desc("Enable the vectorization of loops with in-order (strict) "
337              "FP reductions"));
338 
339 static cl::opt<bool> PreferPredicatedReductionSelect(
340     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
341     cl::desc(
342         "Prefer predicating a reduction operation over an after-loop select."));
343 
344 cl::opt<bool> EnableVPlanNativePath(
345     "enable-vplan-native-path", cl::init(false), cl::Hidden,
346     cl::desc("Enable VPlan-native vectorization path with "
347              "support for outer loop vectorization."));
348 
349 // FIXME: Remove this switch once we have divergence analysis. Currently we
350 // assume divergent non-backedge branches when this switch is true.
351 cl::opt<bool> EnableVPlanPredication(
352     "enable-vplan-predication", cl::init(false), cl::Hidden,
353     cl::desc("Enable VPlan-native vectorization path predicator with "
354              "support for outer loop vectorization."));
355 
356 // This flag enables the stress testing of the VPlan H-CFG construction in the
357 // VPlan-native vectorization path. It must be used in conjunction with
358 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
359 // verification of the H-CFGs built.
360 static cl::opt<bool> VPlanBuildStressTest(
361     "vplan-build-stress-test", cl::init(false), cl::Hidden,
362     cl::desc(
363         "Build VPlan for every supported loop nest in the function and bail "
364         "out right after the build (stress test the VPlan H-CFG construction "
365         "in the VPlan-native vectorization path)."));
366 
367 cl::opt<bool> llvm::EnableLoopInterleaving(
368     "interleave-loops", cl::init(true), cl::Hidden,
369     cl::desc("Enable loop interleaving in Loop vectorization passes"));
370 cl::opt<bool> llvm::EnableLoopVectorization(
371     "vectorize-loops", cl::init(true), cl::Hidden,
372     cl::desc("Run the Loop vectorization passes"));
373 
374 cl::opt<bool> PrintVPlansInDotFormat(
375     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
376     cl::desc("Use dot format instead of plain text when dumping VPlans"));
377 
378 /// A helper function that returns true if the given type is irregular. The
379 /// type is irregular if its allocated size doesn't equal the store size of an
380 /// element of the corresponding vector type.
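/// For example (assuming a typical x86-64 data layout), x86_fp80 has a type
/// size of 80 bits but an allocation size of 128 bits, so an array of x86_fp80
/// contains padding and cannot be bitcast to a vector of x86_fp80.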
381 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
382   // Determine if an array of N elements of type Ty is "bitcast compatible"
383   // with a <N x Ty> vector.
384   // This is only true if there is no padding between the array elements.
385   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
386 }
387 
388 /// A helper function that returns the reciprocal of the block probability of
389 /// predicated blocks. If we return X, we are assuming the predicated block
390 /// will execute once for every X iterations of the loop header.
391 ///
392 /// TODO: We should use actual block probability here, if available. Currently,
393 ///       we always assume predicated blocks have a 50% chance of executing.
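/// For example, the current return value of 2 corresponds to assuming that a
/// predicated block executes once for every two iterations of the loop header,
/// i.e. with 50% probability.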
394 static unsigned getReciprocalPredBlockProb() { return 2; }
395 
396 /// A helper function that returns an integer or floating-point constant with
397 /// value C.
398 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
399   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
400                            : ConstantFP::get(Ty, C);
401 }
402 
403 /// Returns "best known" trip count for the specified loop \p L as defined by
404 /// the following procedure:
405 ///   1) Returns exact trip count if it is known.
406 ///   2) Returns expected trip count according to profile data if any.
407 ///   3) Returns upper bound estimate if it is known.
408 ///   4) Returns None if all of the above failed.
409 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
410   // Check if exact trip count is known.
411   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
412     return ExpectedTC;
413 
414   // Check if there is an expected trip count available from profile data.
415   if (LoopVectorizeWithBlockFrequency)
416     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
417       return EstimatedTC;
418 
419   // Check if upper bound estimate is known.
420   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
421     return ExpectedTC;
422 
423   return None;
424 }
425 
426 // Forward declare GeneratedRTChecks.
427 class GeneratedRTChecks;
428 
429 namespace llvm {
430 
431 /// InnerLoopVectorizer vectorizes loops which contain only one basic
432 /// block to a specified vectorization factor (VF).
433 /// This class performs the widening of scalars into vectors, or multiple
434 /// scalars. This class also implements the following features:
435 /// * It inserts an epilogue loop for handling loops that don't have iteration
436 ///   counts that are known to be a multiple of the vectorization factor.
437 /// * It handles the code generation for reduction variables.
438 /// * Scalarization (implementation using scalars) of un-vectorizable
439 ///   instructions.
440 /// InnerLoopVectorizer does not perform any vectorization-legality
441 /// checks, and relies on the caller to check for the different legality
442 /// aspects. The InnerLoopVectorizer relies on the
443 /// LoopVectorizationLegality class to provide information about the induction
444 /// and reduction variables that were found.
445 class InnerLoopVectorizer {
446 public:
447   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
448                       LoopInfo *LI, DominatorTree *DT,
449                       const TargetLibraryInfo *TLI,
450                       const TargetTransformInfo *TTI, AssumptionCache *AC,
451                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
452                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
453                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
454                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
455       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
456         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
457         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
458         PSI(PSI), RTChecks(RTChecks) {
459     // Query this against the original loop and save it here because the profile
460     // of the original loop header may change as the transformation happens.
461     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
462         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
463   }
464 
465   virtual ~InnerLoopVectorizer() = default;
466 
467   /// Create a new empty loop that will contain vectorized instructions later
468   /// on, while the old loop will be used as the scalar remainder. Control flow
469   /// is generated around the vectorized (and scalar epilogue) loops consisting
470   /// of various checks and bypasses. Return the pre-header block of the new
471   /// loop.
472   /// In the case of epilogue vectorization, this function is overridden to
473   /// handle the more complex control flow around the loops.
474   virtual BasicBlock *createVectorizedLoopSkeleton();
475 
476   /// Widen a single instruction within the innermost loop.
477   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
478                         VPTransformState &State);
479 
480   /// Widen a single call instruction within the innermost loop.
481   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
482                             VPTransformState &State);
483 
484   /// Widen a single select instruction within the innermost loop.
485   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
486                               bool InvariantCond, VPTransformState &State);
487 
488   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
489   void fixVectorizedLoop(VPTransformState &State);
490 
491   // Return true if any runtime check is added.
492   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
493 
494   /// A type for vectorized values in the new loop. Each value from the
495   /// original loop, when vectorized, is represented by UF vector values in the
496   /// new unrolled loop, where UF is the unroll factor.
497   using VectorParts = SmallVector<Value *, 2>;
498 
499   /// Vectorize a single GetElementPtrInst based on information gathered and
500   /// decisions taken during planning.
501   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
502                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
503                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
504 
505   /// Vectorize a single first-order recurrence or pointer induction PHINode in
506   /// a block. This method handles the induction variable canonicalization. It
507   /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
508   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
509                            VPTransformState &State);
510 
511   /// A helper function to scalarize a single Instruction in the innermost
512   /// loop. Generates a scalar instance of \p Instr for the lane and unroll
513   /// part given by \p Instance, predicating the instruction if
514   /// \p IfPredicateInstr is true. Uses the VPValue operands from \p Operands
515   /// instead of \p Instr's operands.
516   void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
517                             const VPIteration &Instance, bool IfPredicateInstr,
518                             VPTransformState &State);
519 
520   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
521   /// is provided, the integer induction variable will first be truncated to
522   /// the corresponding type.
523   void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
524                              VPValue *Def, VPValue *CastDef,
525                              VPTransformState &State);
526 
527   /// Construct the vector value of a scalarized value \p V one lane at a time.
528   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
529                                  VPTransformState &State);
530 
531   /// Try to vectorize interleaved access group \p Group with the base address
532   /// given in \p Addr, optionally masking the vector operations if \p
533   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
534   /// values in the vectorized loop.
535   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
536                                 ArrayRef<VPValue *> VPDefs,
537                                 VPTransformState &State, VPValue *Addr,
538                                 ArrayRef<VPValue *> StoredValues,
539                                 VPValue *BlockInMask = nullptr);
540 
541   /// Vectorize Load and Store instructions with the base address given in \p
542   /// Addr, optionally masking the vector operations if \p BlockInMask is
543   /// non-null. Use \p State to translate given VPValues to IR values in the
544   /// vectorized loop.
545   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
546                                   VPValue *Def, VPValue *Addr,
547                                   VPValue *StoredValue, VPValue *BlockInMask,
548                                   bool ConsecutiveStride, bool Reverse);
549 
550   /// Set the debug location in the builder \p CustomBuilder using the debug
551   /// location in \p V. If \p CustomBuilder is None, the member Builder is used.
552   void setDebugLocFromInst(const Value *V,
553                            Optional<IRBuilder<> *> CustomBuilder = None);
554 
555   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
556   void fixNonInductionPHIs(VPTransformState &State);
557 
558   /// Returns true if the reordering of FP operations is not allowed, but we are
559   /// able to vectorize with strict in-order reductions for the given RdxDesc.
560   bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);
561 
562   /// Create a broadcast instruction. This method generates a broadcast
563   /// instruction (shuffle) for loop invariant values and for the induction
564   /// value. If this is the induction variable then we extend it to N, N+1, ...
565   /// this is needed because each iteration in the loop corresponds to a SIMD
566   /// element.
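  /// As an illustration (assuming VF = 4): a loop-invariant value x becomes
  /// <x, x, x, x>, while the induction value i becomes <i, i+1, i+2, i+3>.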
567   virtual Value *getBroadcastInstrs(Value *V);
568 
569 protected:
570   friend class LoopVectorizationPlanner;
571 
572   /// A small list of PHINodes.
573   using PhiVector = SmallVector<PHINode *, 4>;
574 
575   /// A type for scalarized values in the new loop. Each value from the
576   /// original loop, when scalarized, is represented by UF x VF scalar values
577   /// in the new unrolled loop, where UF is the unroll factor and VF is the
578   /// vectorization factor.
579   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
580 
581   /// Set up the values of the IVs correctly when exiting the vector loop.
582   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
583                     Value *CountRoundDown, Value *EndValue,
584                     BasicBlock *MiddleBlock);
585 
586   /// Create a new induction variable inside L.
587   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
588                                    Value *Step, Instruction *DL);
589 
590   /// Handle all cross-iteration phis in the header.
591   void fixCrossIterationPHIs(VPTransformState &State);
592 
593   /// Create the exit value of first order recurrences in the middle block and
594   /// update their users.
595   void fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR, VPTransformState &State);
596 
597   /// Create code for the loop exit value of the reduction.
598   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
599 
600   /// Clear NSW/NUW flags from reduction instructions if necessary.
601   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
602                                VPTransformState &State);
603 
604   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
605   /// means we need to add the appropriate incoming value from the middle
606   /// block as exiting edges from the scalar epilogue loop (if present) are
607   /// already in place, and we exit the vector loop exclusively to the middle
608   /// block.
609   void fixLCSSAPHIs(VPTransformState &State);
610 
611   /// Iteratively sink the scalarized operands of a predicated instruction into
612   /// the block that was created for it.
613   void sinkScalarOperands(Instruction *PredInst);
614 
615   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
616   /// represented as.
617   void truncateToMinimalBitwidths(VPTransformState &State);
618 
619   /// This function adds
620   /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
621   /// to each vector element of Val. The sequence starts at StartIdx.
622   /// \p Opcode is relevant for FP induction variables.
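  /// As an illustration (arbitrary values): for Val = <0, 0, 0, 0>,
  /// StartIdx = 1 and Step = 2, the result is <2, 4, 6, 8>.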
623   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
624                                Instruction::BinaryOps Opcode =
625                                Instruction::BinaryOpsEnd);
626 
627   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
628   /// variable on which to base the steps, \p Step is the size of the step, and
629   /// \p EntryVal is the value from the original loop that maps to the steps.
630   /// Note that \p EntryVal doesn't have to be an induction variable - it
631   /// can also be a truncate instruction.
632   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
633                         const InductionDescriptor &ID, VPValue *Def,
634                         VPValue *CastDef, VPTransformState &State);
635 
636   /// Create a vector induction phi node based on an existing scalar one. \p
637   /// EntryVal is the value from the original loop that maps to the vector phi
638   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
639   /// truncate instruction, instead of widening the original IV, we widen a
640   /// version of the IV truncated to \p EntryVal's type.
641   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
642                                        Value *Step, Value *Start,
643                                        Instruction *EntryVal, VPValue *Def,
644                                        VPValue *CastDef,
645                                        VPTransformState &State);
646 
647   /// Returns true if an instruction \p I should be scalarized instead of
648   /// vectorized for the chosen vectorization factor.
649   bool shouldScalarizeInstruction(Instruction *I) const;
650 
651   /// Returns true if we should generate a scalar version of \p IV.
652   bool needsScalarInduction(Instruction *IV) const;
653 
654   /// If there is a cast involved in the induction variable \p ID, which should
655   /// be ignored in the vectorized loop body, this function records the
656   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
657   /// cast. We had already proved that the casted Phi is equal to the uncasted
658   /// Phi in the vectorized loop (under a runtime guard), and therefore
659   /// there is no need to vectorize the cast - the same value can be used in the
660   /// vector loop for both the Phi and the cast.
661   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
662   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
663   ///
664   /// \p EntryVal is the value from the original loop that maps to the vector
665   /// phi node and is used to distinguish what is the IV currently being
666   /// processed - original one (if \p EntryVal is a phi corresponding to the
667   /// original IV) or the "newly-created" one based on the proof mentioned above
668   /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
669   /// latter case \p EntryVal is a TruncInst and we must not record anything for
670   /// that IV, but it's error-prone to expect callers of this routine to care
671   /// about that, hence this explicit parameter.
672   void recordVectorLoopValueForInductionCast(
673       const InductionDescriptor &ID, const Instruction *EntryVal,
674       Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
675       unsigned Part, unsigned Lane = UINT_MAX);
676 
677   /// Generate a shuffle sequence that will reverse the vector Vec.
678   virtual Value *reverseVector(Value *Vec);
679 
680   /// Returns (and creates if needed) the original loop trip count.
681   Value *getOrCreateTripCount(Loop *NewLoop);
682 
683   /// Returns (and creates if needed) the trip count of the widened loop.
684   Value *getOrCreateVectorTripCount(Loop *NewLoop);
685 
686   /// Returns a bitcasted value to the requested vector type.
687   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
688   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
689                                 const DataLayout &DL);
690 
691   /// Emit a bypass check to see if the vector trip count is zero, including if
692   /// it overflows.
693   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
694 
695   /// Emit a bypass check to see if all of the SCEV assumptions we've
696   /// had to make are correct. Returns the block containing the checks or
697   /// nullptr if no checks have been added.
698   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
699 
700   /// Emit bypass checks to check any memory assumptions we may have made.
701   /// Returns the block containing the checks or nullptr if no checks have been
702   /// added.
703   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
704 
705   /// Compute the transformed value of Index at offset StartValue using step
706   /// StepValue.
707   /// For integer induction, returns StartValue + Index * StepValue.
708   /// For pointer induction, returns StartValue[Index * StepValue].
709   /// FIXME: The newly created binary instructions should contain nsw/nuw
710   /// flags, which can be found from the original scalar operations.
711   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
712                               const DataLayout &DL,
713                               const InductionDescriptor &ID) const;
714 
715   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
716   /// vector loop preheader, middle block and scalar preheader. Also
717   /// allocate a loop object for the new vector loop and return it.
718   Loop *createVectorLoopSkeleton(StringRef Prefix);
719 
720   /// Create new phi nodes for the induction variables to resume iteration count
721   /// in the scalar epilogue, from where the vectorized loop left off (given by
722   /// \p VectorTripCount).
723   /// In cases where the loop skeleton is more complicated (e.g. epilogue
724   /// vectorization) and the resume values can come from an additional bypass
725   /// block, the \p AdditionalBypass pair provides information about the bypass
726   /// block and the end value on the edge from bypass to this loop.
727   void createInductionResumeValues(
728       Loop *L, Value *VectorTripCount,
729       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
730 
731   /// Complete the loop skeleton by adding debug MDs, creating appropriate
732   /// conditional branches in the middle block, preparing the builder and
733   /// running the verifier. Take in the vector loop \p L as argument, and return
734   /// the preheader of the completed vector loop.
735   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
736 
737   /// Add additional metadata to \p To that was not present on \p Orig.
738   ///
739   /// Currently this is used to add the noalias annotations based on the
740   /// inserted memchecks.  Use this for instructions that are *cloned* into the
741   /// vector loop.
742   void addNewMetadata(Instruction *To, const Instruction *Orig);
743 
744   /// Add metadata from one instruction to another.
745   ///
746   /// This includes both the original MDs from \p From and additional ones (\see
747   /// addNewMetadata).  Use this for *newly created* instructions in the vector
748   /// loop.
749   void addMetadata(Instruction *To, Instruction *From);
750 
751   /// Similar to the previous function but it adds the metadata to a
752   /// vector of instructions.
753   void addMetadata(ArrayRef<Value *> To, Instruction *From);
754 
755   /// Allow subclasses to override and print debug traces before/after vplan
756   /// execution, when trace information is requested.
757   virtual void printDebugTracesAtStart() {}
758   virtual void printDebugTracesAtEnd() {}
759 
760   /// The original loop.
761   Loop *OrigLoop;
762 
763   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
764   /// dynamic knowledge to simplify SCEV expressions and converts them to a
765   /// more usable form.
766   PredicatedScalarEvolution &PSE;
767 
768   /// Loop Info.
769   LoopInfo *LI;
770 
771   /// Dominator Tree.
772   DominatorTree *DT;
773 
774   /// Alias Analysis.
775   AAResults *AA;
776 
777   /// Target Library Info.
778   const TargetLibraryInfo *TLI;
779 
780   /// Target Transform Info.
781   const TargetTransformInfo *TTI;
782 
783   /// Assumption Cache.
784   AssumptionCache *AC;
785 
786   /// Interface to emit optimization remarks.
787   OptimizationRemarkEmitter *ORE;
788 
789   /// LoopVersioning.  It's only set up (non-null) if memchecks were
790   /// used.
791   ///
792   /// This is currently only used to add no-alias metadata based on the
793   /// memchecks.  The actual versioning is performed manually.
794   std::unique_ptr<LoopVersioning> LVer;
795 
796   /// The vectorization SIMD factor to use. Each vector will have this many
797   /// vector elements.
798   ElementCount VF;
799 
800   /// The vectorization unroll factor to use. Each scalar is vectorized to this
801   /// many different vector instructions.
802   unsigned UF;
803 
804   /// The builder that we use
805   IRBuilder<> Builder;
806 
807   // --- Vectorization state ---
808 
809   /// The vector-loop preheader.
810   BasicBlock *LoopVectorPreHeader;
811 
812   /// The scalar-loop preheader.
813   BasicBlock *LoopScalarPreHeader;
814 
815   /// Middle Block between the vector and the scalar.
816   BasicBlock *LoopMiddleBlock;
817 
818   /// The unique ExitBlock of the scalar loop if one exists.  Note that
819   /// there can be multiple exiting edges reaching this block.
820   BasicBlock *LoopExitBlock;
821 
822   /// The vector loop body.
823   BasicBlock *LoopVectorBody;
824 
825   /// The scalar loop body.
826   BasicBlock *LoopScalarBody;
827 
828   /// A list of all bypass blocks. The first block is the entry of the loop.
829   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
830 
831   /// The new Induction variable which was added to the new block.
832   PHINode *Induction = nullptr;
833 
834   /// The induction variable of the old basic block.
835   PHINode *OldInduction = nullptr;
836 
837   /// Store instructions that were predicated.
838   SmallVector<Instruction *, 4> PredicatedInstructions;
839 
840   /// Trip count of the original loop.
841   Value *TripCount = nullptr;
842 
843   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
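  /// For example, with an original trip count of 17, VF = 4 and UF = 2, the
  /// vector trip count is 16 and the one remaining iteration is left to the
  /// scalar epilogue loop.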
844   Value *VectorTripCount = nullptr;
845 
846   /// The legality analysis.
847   LoopVectorizationLegality *Legal;
848 
849   /// The profitability analysis.
850   LoopVectorizationCostModel *Cost;
851 
852   // Record whether runtime checks are added.
853   bool AddedSafetyChecks = false;
854 
855   // Holds the end values for each induction variable. We save the end values
856   // so we can later fix-up the external users of the induction variables.
857   DenseMap<PHINode *, Value *> IVEndValues;
858 
859   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
860   // fixed up at the end of vector code generation.
861   SmallVector<PHINode *, 8> OrigPHIsToFix;
862 
863   /// BFI and PSI are used to check for profile guided size optimizations.
864   BlockFrequencyInfo *BFI;
865   ProfileSummaryInfo *PSI;
866 
867   // Whether this loop should be optimized for size based on profile guided size
868   // optimizations.
869   bool OptForSizeBasedOnProfile;
870 
871   /// Structure to hold information about generated runtime checks, responsible
872   /// for cleaning up the checks if vectorization turns out to be unprofitable.
873   GeneratedRTChecks &RTChecks;
874 };
875 
876 class InnerLoopUnroller : public InnerLoopVectorizer {
877 public:
878   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
879                     LoopInfo *LI, DominatorTree *DT,
880                     const TargetLibraryInfo *TLI,
881                     const TargetTransformInfo *TTI, AssumptionCache *AC,
882                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
883                     LoopVectorizationLegality *LVL,
884                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
885                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
886       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
887                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
888                             BFI, PSI, Check) {}
889 
890 private:
891   Value *getBroadcastInstrs(Value *V) override;
892   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
893                        Instruction::BinaryOps Opcode =
894                        Instruction::BinaryOpsEnd) override;
895   Value *reverseVector(Value *Vec) override;
896 };
897 
898 /// Encapsulate information regarding vectorization of a loop and its epilogue.
899 /// This information is meant to be updated and used across two stages of
900 /// epilogue vectorization.
901 struct EpilogueLoopVectorizationInfo {
902   ElementCount MainLoopVF = ElementCount::getFixed(0);
903   unsigned MainLoopUF = 0;
904   ElementCount EpilogueVF = ElementCount::getFixed(0);
905   unsigned EpilogueUF = 0;
906   BasicBlock *MainLoopIterationCountCheck = nullptr;
907   BasicBlock *EpilogueIterationCountCheck = nullptr;
908   BasicBlock *SCEVSafetyCheck = nullptr;
909   BasicBlock *MemSafetyCheck = nullptr;
910   Value *TripCount = nullptr;
911   Value *VectorTripCount = nullptr;
912 
913   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
914                                 ElementCount EVF, unsigned EUF)
915       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
916     assert(EUF == 1 &&
917            "A high UF for the epilogue loop is likely not beneficial.");
918   }
919 };
920 
921 /// An extension of the inner loop vectorizer that creates a skeleton for a
922 /// vectorized loop that has its epilogue (residual) also vectorized.
923 /// The idea is to run the VPlan on a given loop twice: first to set up the
924 /// skeleton and vectorize the main loop, and second to complete the skeleton
925 /// from the first step and vectorize the epilogue.  This is achieved by
926 /// deriving two concrete strategy classes from this base class and invoking
927 /// them in succession from the loop vectorizer planner.
928 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
929 public:
930   InnerLoopAndEpilogueVectorizer(
931       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
932       DominatorTree *DT, const TargetLibraryInfo *TLI,
933       const TargetTransformInfo *TTI, AssumptionCache *AC,
934       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
935       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
936       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
937       GeneratedRTChecks &Checks)
938       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
939                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
940                             Checks),
941         EPI(EPI) {}
942 
943   // Override this function to handle the more complex control flow around the
944   // three loops.
945   BasicBlock *createVectorizedLoopSkeleton() final override {
946     return createEpilogueVectorizedLoopSkeleton();
947   }
948 
949   /// The interface for creating a vectorized skeleton using one of two
950   /// different strategies, each corresponding to one execution of the VPlan
951   /// as described above.
952   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
953 
954   /// Holds and updates state information required to vectorize the main loop
955   /// and its epilogue in two separate passes. This setup helps us avoid
956   /// regenerating and recomputing runtime safety checks. It also helps us to
957   /// shorten the iteration-count-check path length for the cases where the
958   /// iteration count of the loop is so small that the main vector loop is
959   /// completely skipped.
960   EpilogueLoopVectorizationInfo &EPI;
961 };
962 
963 /// A specialized derived class of inner loop vectorizer that performs
964 /// vectorization of *main* loops in the process of vectorizing loops and their
965 /// epilogues.
966 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
967 public:
968   EpilogueVectorizerMainLoop(
969       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
970       DominatorTree *DT, const TargetLibraryInfo *TLI,
971       const TargetTransformInfo *TTI, AssumptionCache *AC,
972       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
973       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
974       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
975       GeneratedRTChecks &Check)
976       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
977                                        EPI, LVL, CM, BFI, PSI, Check) {}
978   /// Implements the interface for creating a vectorized skeleton using the
979   /// *main loop* strategy (i.e. the first pass of VPlan execution).
980   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
981 
982 protected:
983   /// Emits an iteration count bypass check once for the main loop (when \p
984   /// ForEpilogue is false) and once for the epilogue loop (when \p
985   /// ForEpilogue is true).
986   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
987                                              bool ForEpilogue);
988   void printDebugTracesAtStart() override;
989   void printDebugTracesAtEnd() override;
990 };
991 
992 /// A specialized derived class of inner loop vectorizer that performs
993 /// vectorization of *epilogue* loops in the process of vectorizing loops and
994 /// their epilogues.
995 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
996 public:
997   EpilogueVectorizerEpilogueLoop(
998       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
999       DominatorTree *DT, const TargetLibraryInfo *TLI,
1000       const TargetTransformInfo *TTI, AssumptionCache *AC,
1001       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
1002       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
1003       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
1004       GeneratedRTChecks &Checks)
1005       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1006                                        EPI, LVL, CM, BFI, PSI, Checks) {}
1007   /// Implements the interface for creating a vectorized skeleton using the
1008   /// *epilogue loop* strategy (i.e. the second pass of VPlan execution).
1009   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1010 
1011 protected:
1012   /// Emits an iteration count bypass check after the main vector loop has
1013   /// finished to see if there are any iterations left to execute by either
1014   /// the vector epilogue or the scalar epilogue.
1015   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1016                                                       BasicBlock *Bypass,
1017                                                       BasicBlock *Insert);
1018   void printDebugTracesAtStart() override;
1019   void printDebugTracesAtEnd() override;
1020 };
1021 } // end namespace llvm
1022 
1023 /// Look for a meaningful debug location on the instruction or its
1024 /// operands.
1025 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1026   if (!I)
1027     return I;
1028 
1029   DebugLoc Empty;
1030   if (I->getDebugLoc() != Empty)
1031     return I;
1032 
1033   for (Use &Op : I->operands()) {
1034     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1035       if (OpInst->getDebugLoc() != Empty)
1036         return OpInst;
1037   }
1038 
1039   return I;
1040 }
1041 
1042 void InnerLoopVectorizer::setDebugLocFromInst(
1043     const Value *V, Optional<IRBuilder<> *> CustomBuilder) {
1044   IRBuilder<> *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
1045   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
1046     const DILocation *DIL = Inst->getDebugLoc();
1047 
1048     // When an FSDiscriminator is enabled, we don't need to add the multiply
1049     // factors to the discriminators.
1050     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1051         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
1052       // FIXME: For scalable vectors, assume vscale=1.
1053       auto NewDIL =
1054           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1055       if (NewDIL)
1056         B->SetCurrentDebugLocation(NewDIL.getValue());
1057       else
1058         LLVM_DEBUG(dbgs()
1059                    << "Failed to create new discriminator: "
1060                    << DIL->getFilename() << " Line: " << DIL->getLine());
1061     } else
1062       B->SetCurrentDebugLocation(DIL);
1063   } else
1064     B->SetCurrentDebugLocation(DebugLoc());
1065 }
1066 
1067 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1068 /// is passed, the message relates to that particular instruction.
1069 #ifndef NDEBUG
1070 static void debugVectorizationMessage(const StringRef Prefix,
1071                                       const StringRef DebugMsg,
1072                                       Instruction *I) {
1073   dbgs() << "LV: " << Prefix << DebugMsg;
1074   if (I != nullptr)
1075     dbgs() << " " << *I;
1076   else
1077     dbgs() << '.';
1078   dbgs() << '\n';
1079 }
1080 #endif
1081 
1082 /// Create an analysis remark that explains why vectorization failed
1083 ///
1084 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1085 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1086 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1087 /// the location of the remark.  \return the remark object that can be
1088 /// streamed to.
1089 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1090     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1091   Value *CodeRegion = TheLoop->getHeader();
1092   DebugLoc DL = TheLoop->getStartLoc();
1093 
1094   if (I) {
1095     CodeRegion = I->getParent();
1096     // If there is no debug location attached to the instruction, fall back to
1097     // using the loop's.
1098     if (I->getDebugLoc())
1099       DL = I->getDebugLoc();
1100   }
1101 
1102   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1103 }
1104 
1105 /// Return a value for Step multiplied by VF.
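/// As an illustration (arbitrary values): for Step = 2 and a fixed VF of 4
/// this returns the constant 8, while for a scalable VF of <vscale x 4> it
/// returns 8 * vscale, based on a call to the llvm.vscale intrinsic.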
1106 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1107   assert(isa<ConstantInt>(Step) && "Expected an integer step");
1108   Constant *StepVal = ConstantInt::get(
1109       Step->getType(),
1110       cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1111   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1112 }
1113 
1114 namespace llvm {
1115 
1116 /// Return the runtime value for VF.
1117 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1118   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1119   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1120 }
1121 
1122 void reportVectorizationFailure(const StringRef DebugMsg,
1123                                 const StringRef OREMsg, const StringRef ORETag,
1124                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1125                                 Instruction *I) {
1126   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1127   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1128   ORE->emit(
1129       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1130       << "loop not vectorized: " << OREMsg);
1131 }
1132 
1133 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1134                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1135                              Instruction *I) {
1136   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1137   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1138   ORE->emit(
1139       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1140       << Msg);
1141 }
1142 
1143 } // end namespace llvm
1144 
1145 #ifndef NDEBUG
1146 /// \return string containing a file name and a line # for the given loop.
1147 static std::string getDebugLocString(const Loop *L) {
1148   std::string Result;
1149   if (L) {
1150     raw_string_ostream OS(Result);
1151     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1152       LoopDbgLoc.print(OS);
1153     else
1154       // Just print the module name.
1155       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1156     OS.flush();
1157   }
1158   return Result;
1159 }
1160 #endif
1161 
1162 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1163                                          const Instruction *Orig) {
1164   // If the loop was versioned with memchecks, add the corresponding no-alias
1165   // metadata.
1166   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1167     LVer->annotateInstWithNoAlias(To, Orig);
1168 }
1169 
1170 void InnerLoopVectorizer::addMetadata(Instruction *To,
1171                                       Instruction *From) {
1172   propagateMetadata(To, From);
1173   addNewMetadata(To, From);
1174 }
1175 
1176 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1177                                       Instruction *From) {
1178   for (Value *V : To) {
1179     if (Instruction *I = dyn_cast<Instruction>(V))
1180       addMetadata(I, From);
1181   }
1182 }
1183 
1184 namespace llvm {
1185 
1186 // Loop vectorization cost-model hints how the scalar epilogue loop should be
1187 // lowered.
1188 enum ScalarEpilogueLowering {
1189 
1190   // The default: allowing scalar epilogues.
1191   CM_ScalarEpilogueAllowed,
1192 
1193   // Vectorization with OptForSize: don't allow epilogues.
1194   CM_ScalarEpilogueNotAllowedOptSize,
1195 
  // A special case of vectorization with OptForSize: loops with a very small
1197   // trip count are considered for vectorization under OptForSize, thereby
1198   // making sure the cost of their loop body is dominant, free of runtime
1199   // guards and scalar iteration overheads.
1200   CM_ScalarEpilogueNotAllowedLowTripLoop,
1201 
1202   // Loop hint predicate indicating an epilogue is undesired.
1203   CM_ScalarEpilogueNotNeededUsePredicate,
1204 
  // Directive indicating we must either tail fold or not vectorize.
1206   CM_ScalarEpilogueNotAllowedUsePredicate
1207 };
1208 
1209 /// ElementCountComparator creates a total ordering for ElementCount
1210 /// for the purposes of using it in a set structure.
1211 struct ElementCountComparator {
1212   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1213     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1214            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1215   }
1216 };
1217 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
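
// Under ElementCountComparator, all fixed VFs order before all scalable ones,
// e.g. (illustrative): 1, 2, 4, 8, vscale x 1, vscale x 2, vscale x 4.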
1218 
1219 /// LoopVectorizationCostModel - estimates the expected speedups due to
1220 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
1223 /// expected speedup/slowdowns due to the supported instruction set. We use the
1224 /// TargetTransformInfo to query the different backends for the cost of
1225 /// different operations.
1226 class LoopVectorizationCostModel {
1227 public:
1228   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1229                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1230                              LoopVectorizationLegality *Legal,
1231                              const TargetTransformInfo &TTI,
1232                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1233                              AssumptionCache *AC,
1234                              OptimizationRemarkEmitter *ORE, const Function *F,
1235                              const LoopVectorizeHints *Hints,
1236                              InterleavedAccessInfo &IAI)
1237       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1238         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1239         Hints(Hints), InterleaveInfo(IAI) {}
1240 
1241   /// \return An upper bound for the vectorization factors (both fixed and
1242   /// scalable). If the factors are 0, vectorization and interleaving should be
1243   /// avoided up front.
1244   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1245 
1246   /// \return True if runtime checks are required for vectorization, and false
1247   /// otherwise.
1248   bool runtimeChecksRequired();
1249 
1250   /// \return The most profitable vectorization factor and the cost of that VF.
1251   /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1252   /// then this vectorization factor will be selected if vectorization is
1253   /// possible.
1254   VectorizationFactor
1255   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1256 
1257   VectorizationFactor
1258   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1259                                     const LoopVectorizationPlanner &LVP);
1260 
1261   /// Setup cost-based decisions for user vectorization factor.
1262   /// \return true if the UserVF is a feasible VF to be chosen.
1263   bool selectUserVectorizationFactor(ElementCount UserVF) {
1264     collectUniformsAndScalars(UserVF);
1265     collectInstsToScalarize(UserVF);
1266     return expectedCost(UserVF).first.isValid();
1267   }
1268 
1269   /// \return The size (in bits) of the smallest and widest types in the code
1270   /// that needs to be vectorized. We ignore values that remain scalar such as
1271   /// 64 bit loop indices.
1272   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1273 
1274   /// \return The desired interleave count.
1275   /// If interleave count has been specified by metadata it will be returned.
1276   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1277   /// are the selected vectorization factor and the cost of the selected VF.
1278   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1279 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
1284   /// the lists of loop-uniform and loop-scalar instructions.
1285   /// The calculated cost is saved with widening decision in order to
1286   /// avoid redundant calculations.
1287   void setCostBasedWideningDecision(ElementCount VF);
1288 
1289   /// A struct that represents some properties of the register usage
1290   /// of a loop.
1291   struct RegisterUsage {
1292     /// Holds the number of loop invariant values that are used in the loop.
1293     /// The key is ClassID of target-provided register class.
1294     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1295     /// Holds the maximum number of concurrent live intervals in the loop.
1296     /// The key is ClassID of target-provided register class.
1297     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1298   };
1299 
  /// \return Information about the register usage of the loop for the
  /// given vectorization factors.
1302   SmallVector<RegisterUsage, 8>
1303   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1304 
1305   /// Collect values we want to ignore in the cost model.
1306   void collectValuesToIgnore();
1307 
1308   /// Collect all element types in the loop for which widening is needed.
1309   void collectElementTypesForWidening();
1310 
1311   /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1313   void collectInLoopReductions();
1314 
1315   /// Returns true if we should use strict in-order reductions for the given
1316   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1317   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1318   /// of FP operations.
1319   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1320     return !Hints->allowReordering() && RdxDesc.isOrdered();
1321   }
1322 
1323   /// \returns The smallest bitwidth each instruction can be represented with.
1324   /// The vector equivalents of these instructions should be truncated to this
1325   /// type.
1326   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1327     return MinBWs;
1328   }
1329 
1330   /// \returns True if it is more profitable to scalarize instruction \p I for
1331   /// vectorization factor \p VF.
1332   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1333     assert(VF.isVector() &&
1334            "Profitable to scalarize relevant only for VF > 1.");
1335 
1336     // Cost model is not run in the VPlan-native path - return conservative
1337     // result until this changes.
1338     if (EnableVPlanNativePath)
1339       return false;
1340 
1341     auto Scalars = InstsToScalarize.find(VF);
1342     assert(Scalars != InstsToScalarize.end() &&
1343            "VF not yet analyzed for scalarization profitability");
1344     return Scalars->second.find(I) != Scalars->second.end();
1345   }
1346 
1347   /// Returns true if \p I is known to be uniform after vectorization.
1348   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1349     if (VF.isScalar())
1350       return true;
1351 
1352     // Cost model is not run in the VPlan-native path - return conservative
1353     // result until this changes.
1354     if (EnableVPlanNativePath)
1355       return false;
1356 
1357     auto UniformsPerVF = Uniforms.find(VF);
1358     assert(UniformsPerVF != Uniforms.end() &&
1359            "VF not yet analyzed for uniformity");
1360     return UniformsPerVF->second.count(I);
1361   }
1362 
1363   /// Returns true if \p I is known to be scalar after vectorization.
1364   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1365     if (VF.isScalar())
1366       return true;
1367 
1368     // Cost model is not run in the VPlan-native path - return conservative
1369     // result until this changes.
1370     if (EnableVPlanNativePath)
1371       return false;
1372 
1373     auto ScalarsPerVF = Scalars.find(VF);
1374     assert(ScalarsPerVF != Scalars.end() &&
1375            "Scalar values are not calculated for VF");
1376     return ScalarsPerVF->second.count(I);
1377   }
1378 
1379   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1380   /// for vectorization factor \p VF.
1381   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1382     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1383            !isProfitableToScalarize(I, VF) &&
1384            !isScalarAfterVectorization(I, VF);
1385   }
1386 
1387   /// Decision that was taken during cost calculation for memory instruction.
1388   enum InstWidening {
1389     CM_Unknown,
1390     CM_Widen,         // For consecutive accesses with stride +1.
1391     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1392     CM_Interleave,
1393     CM_GatherScatter,
1394     CM_Scalarize
1395   };
1396 
1397   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1398   /// instruction \p I and vector width \p VF.
1399   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1400                            InstructionCost Cost) {
1401     assert(VF.isVector() && "Expected VF >=2");
1402     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1403   }
1404 
1405   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1406   /// interleaving group \p Grp and vector width \p VF.
1407   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1408                            ElementCount VF, InstWidening W,
1409                            InstructionCost Cost) {
1410     assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
1412     /// But the cost will be assigned to one instruction only.
1413     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1414       if (auto *I = Grp->getMember(i)) {
1415         if (Grp->getInsertPos() == I)
1416           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1417         else
1418           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1419       }
1420     }
1421   }
1422 
1423   /// Return the cost model decision for the given instruction \p I and vector
1424   /// width \p VF. Return CM_Unknown if this instruction did not pass
1425   /// through the cost modeling.
1426   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1427     assert(VF.isVector() && "Expected VF to be a vector VF");
1428     // Cost model is not run in the VPlan-native path - return conservative
1429     // result until this changes.
1430     if (EnableVPlanNativePath)
1431       return CM_GatherScatter;
1432 
1433     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1434     auto Itr = WideningDecisions.find(InstOnVF);
1435     if (Itr == WideningDecisions.end())
1436       return CM_Unknown;
1437     return Itr->second.first;
1438   }
1439 
1440   /// Return the vectorization cost for the given instruction \p I and vector
1441   /// width \p VF.
1442   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1443     assert(VF.isVector() && "Expected VF >=2");
1444     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1445     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1446            "The cost is not calculated");
1447     return WideningDecisions[InstOnVF].second;
1448   }
1449 
1450   /// Return True if instruction \p I is an optimizable truncate whose operand
1451   /// is an induction variable. Such a truncate will be removed by adding a new
1452   /// induction variable with the destination type.
1453   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1454     // If the instruction is not a truncate, return false.
1455     auto *Trunc = dyn_cast<TruncInst>(I);
1456     if (!Trunc)
1457       return false;
1458 
1459     // Get the source and destination types of the truncate.
1460     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1461     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1462 
1463     // If the truncate is free for the given types, return false. Replacing a
1464     // free truncate with an induction variable would add an induction variable
1465     // update instruction to each iteration of the loop. We exclude from this
1466     // check the primary induction variable since it will need an update
1467     // instruction regardless.
1468     Value *Op = Trunc->getOperand(0);
1469     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1470       return false;
1471 
1472     // If the truncated value is not an induction variable, return false.
1473     return Legal->isInductionPhi(Op);
1474   }
1475 
1476   /// Collects the instructions to scalarize for each predicated instruction in
1477   /// the loop.
1478   void collectInstsToScalarize(ElementCount VF);
1479 
1480   /// Collect Uniform and Scalar values for the given \p VF.
1481   /// The sets depend on CM decision for Load/Store instructions
1482   /// that may be vectorized as interleave, gather-scatter or scalarized.
1483   void collectUniformsAndScalars(ElementCount VF) {
1484     // Do the analysis once.
1485     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1486       return;
1487     setCostBasedWideningDecision(VF);
1488     collectLoopUniforms(VF);
1489     collectLoopScalars(VF);
1490   }
1491 
1492   /// Returns true if the target machine supports masked store operation
1493   /// for the given \p DataType and kind of access to \p Ptr.
1494   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1495     return Legal->isConsecutivePtr(DataType, Ptr) &&
1496            TTI.isLegalMaskedStore(DataType, Alignment);
1497   }
1498 
1499   /// Returns true if the target machine supports masked load operation
1500   /// for the given \p DataType and kind of access to \p Ptr.
1501   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1502     return Legal->isConsecutivePtr(DataType, Ptr) &&
1503            TTI.isLegalMaskedLoad(DataType, Alignment);
1504   }
1505 
1506   /// Returns true if the target machine can represent \p V as a masked gather
1507   /// or scatter operation.
1508   bool isLegalGatherOrScatter(Value *V) {
1509     bool LI = isa<LoadInst>(V);
1510     bool SI = isa<StoreInst>(V);
1511     if (!LI && !SI)
1512       return false;
1513     auto *Ty = getLoadStoreType(V);
1514     Align Align = getLoadStoreAlignment(V);
1515     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1516            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1517   }
1518 
1519   /// Returns true if the target machine supports all of the reduction
1520   /// variables found for the given VF.
1521   bool canVectorizeReductions(ElementCount VF) const {
1522     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1523       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1524       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1525     }));
1526   }
1527 
1528   /// Returns true if \p I is an instruction that will be scalarized with
1529   /// predication. Such instructions include conditional stores and
1530   /// instructions that may divide by zero.
1531   /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1533   bool isScalarWithPredication(Instruction *I) const;
1534 
1535   // Returns true if \p I is an instruction that will be predicated either
1536   // through scalar predication or masked load/store or masked gather/scatter.
1537   // Superset of instructions that return true for isScalarWithPredication.
1538   bool isPredicatedInst(Instruction *I) {
1539     if (!blockNeedsPredication(I->getParent()))
1540       return false;
1541     // Loads and stores that need some form of masked operation are predicated
1542     // instructions.
1543     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1544       return Legal->isMaskRequired(I);
1545     return isScalarWithPredication(I);
1546   }
1547 
1548   /// Returns true if \p I is a memory instruction with consecutive memory
1549   /// access that can be widened.
1550   bool
1551   memoryInstructionCanBeWidened(Instruction *I,
1552                                 ElementCount VF = ElementCount::getFixed(1));
1553 
1554   /// Returns true if \p I is a memory instruction in an interleaved-group
1555   /// of memory accesses that can be vectorized with wide vector loads/stores
1556   /// and shuffles.
1557   bool
1558   interleavedAccessCanBeWidened(Instruction *I,
1559                                 ElementCount VF = ElementCount::getFixed(1));
1560 
1561   /// Check if \p Instr belongs to any interleaved access group.
1562   bool isAccessInterleaved(Instruction *Instr) {
1563     return InterleaveInfo.isInterleaved(Instr);
1564   }
1565 
1566   /// Get the interleaved access group that \p Instr belongs to.
1567   const InterleaveGroup<Instruction> *
1568   getInterleavedAccessGroup(Instruction *Instr) {
1569     return InterleaveInfo.getInterleaveGroup(Instr);
1570   }
1571 
1572   /// Returns true if we're required to use a scalar epilogue for at least
1573   /// the final iteration of the original loop.
1574   bool requiresScalarEpilogue(ElementCount VF) const {
1575     if (!isScalarEpilogueAllowed())
1576       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
1578     // iteration in scalar form.
1579     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1580       return true;
1581     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1582   }
1583 
  /// Returns true if a scalar epilogue is allowed, i.e. it is not disallowed
  /// due to optsize, a low trip count, or a loop hint annotation.
1586   bool isScalarEpilogueAllowed() const {
1587     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1588   }
1589 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1591   bool foldTailByMasking() const { return FoldTailByMasking; }
1592 
1593   bool blockNeedsPredication(BasicBlock *BB) const {
1594     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1595   }
1596 
1597   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1598   /// nodes to the chain of instructions representing the reductions. Uses a
1599   /// MapVector to ensure deterministic iteration order.
1600   using ReductionChainMap =
1601       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1602 
1603   /// Return the chain of instructions representing an inloop reduction.
1604   const ReductionChainMap &getInLoopReductionChains() const {
1605     return InLoopReductionChains;
1606   }
1607 
1608   /// Returns true if the Phi is part of an inloop reduction.
1609   bool isInLoopReduction(PHINode *Phi) const {
1610     return InLoopReductionChains.count(Phi);
1611   }
1612 
1613   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1614   /// with factor VF.  Return the cost of the instruction, including
1615   /// scalarization overhead if it's needed.
1616   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1617 
1618   /// Estimate cost of a call instruction CI if it were vectorized with factor
1619   /// VF. Return the cost of the instruction, including scalarization overhead
1620   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1621   /// scalarized -
  /// i.e. either a vector version isn't available or it is too expensive.
1623   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1624                                     bool &NeedToScalarize) const;
1625 
1626   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1627   /// that of B.
1628   bool isMoreProfitable(const VectorizationFactor &A,
1629                         const VectorizationFactor &B) const;
1630 
1631   /// Invalidates decisions already taken by the cost model.
1632   void invalidateCostModelingDecisions() {
1633     WideningDecisions.clear();
1634     Uniforms.clear();
1635     Scalars.clear();
1636   }
1637 
1638 private:
1639   unsigned NumPredStores = 0;
1640 
1641   /// \return An upper bound for the vectorization factors for both
1642   /// fixed and scalable vectorization, where the minimum-known number of
1643   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1644   /// disabled or unsupported, then the scalable part will be equal to
1645   /// ElementCount::getScalable(0).
1646   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1647                                            ElementCount UserVF);
1648 
  /// \return the maximized element count based on the target's vector
1650   /// registers and the loop trip-count, but limited to a maximum safe VF.
1651   /// This is a helper function of computeFeasibleMaxVF.
1652   /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1653   /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
1655   /// D98509). The issue is currently under investigation and this workaround
1656   /// will be removed as soon as possible.
1657   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1658                                        unsigned SmallestType,
1659                                        unsigned WidestType,
1660                                        const ElementCount &MaxSafeVF);
1661 
1662   /// \return the maximum legal scalable VF, based on the safe max number
1663   /// of elements.
1664   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1665 
1666   /// The vectorization cost is a combination of the cost itself and a boolean
1667   /// indicating whether any of the contributing operations will actually
1668   /// operate on vector values after type legalization in the backend. If this
1669   /// latter value is false, then all operations will be scalarized (i.e. no
1670   /// vectorization has actually taken place).
1671   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1672 
1673   /// Returns the expected execution cost. The unit of the cost does
1674   /// not matter because we use the 'cost' units to compare different
1675   /// vector widths. The cost that is returned is *not* normalized by
1676   /// the factor width. If \p Invalid is not nullptr, this function
1677   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1678   /// each instruction that has an Invalid cost for the given VF.
1679   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1680   VectorizationCostTy
1681   expectedCost(ElementCount VF,
1682                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1683 
1684   /// Returns the execution time cost of an instruction for a given vector
1685   /// width. Vector width of one means scalar.
1686   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1687 
1688   /// The cost-computation logic from getInstructionCost which provides
1689   /// the vector type as an output parameter.
1690   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1691                                      Type *&VectorTy);
1692 
1693   /// Return the cost of instructions in an inloop reduction pattern, if I is
1694   /// part of that pattern.
1695   Optional<InstructionCost>
1696   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1697                           TTI::TargetCostKind CostKind);
1698 
1699   /// Calculate vectorization cost of memory instruction \p I.
1700   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1701 
1702   /// The cost computation for scalarized memory instruction.
1703   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1704 
1705   /// The cost computation for interleaving group of memory instructions.
1706   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1707 
1708   /// The cost computation for Gather/Scatter instruction.
1709   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1710 
1711   /// The cost computation for widening instruction \p I with consecutive
1712   /// memory access.
1713   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1714 
1715   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1716   /// Load: scalar load + broadcast.
1717   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1718   /// element)
1719   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1720 
1721   /// Estimate the overhead of scalarizing an instruction. This is a
1722   /// convenience wrapper for the type-based getScalarizationOverhead API.
1723   InstructionCost getScalarizationOverhead(Instruction *I,
1724                                            ElementCount VF) const;
1725 
  /// Returns whether the instruction is a load or store and will be emitted
1727   /// as a vector operation.
1728   bool isConsecutiveLoadOrStore(Instruction *I);
1729 
1730   /// Returns true if an artificially high cost for emulated masked memrefs
1731   /// should be used.
1732   bool useEmulatedMaskMemRefHack(Instruction *I);
1733 
1734   /// Map of scalar integer values to the smallest bitwidth they can be legally
1735   /// represented as. The vector equivalents of these values should be truncated
1736   /// to this type.
1737   MapVector<Instruction *, uint64_t> MinBWs;
1738 
1739   /// A type representing the costs for instructions if they were to be
1740   /// scalarized rather than vectorized. The entries are Instruction-Cost
1741   /// pairs.
1742   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1743 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1746   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1747 
1748   /// Records whether it is allowed to have the original scalar loop execute at
1749   /// least once. This may be needed as a fallback loop in case runtime
1750   /// aliasing/dependence checks fail, or to handle the tail/remainder
1751   /// iterations when the trip count is unknown or doesn't divide by the VF,
1752   /// or as a peel-loop to handle gaps in interleave-groups.
1753   /// Under optsize and when the trip count is very small we don't allow any
1754   /// iterations to execute in the scalar loop.
1755   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1756 
1757   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1758   bool FoldTailByMasking = false;
1759 
1760   /// A map holding scalar costs for different vectorization factors. The
1761   /// presence of a cost for an instruction in the mapping indicates that the
1762   /// instruction will be scalarized when vectorizing with the associated
1763   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1764   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1765 
1766   /// Holds the instructions known to be uniform after vectorization.
1767   /// The data is collected per VF.
1768   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1769 
1770   /// Holds the instructions known to be scalar after vectorization.
1771   /// The data is collected per VF.
1772   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1773 
1774   /// Holds the instructions (address computations) that are forced to be
1775   /// scalarized.
1776   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1777 
1778   /// PHINodes of the reductions that should be expanded in-loop along with
1779   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1781   ReductionChainMap InLoopReductionChains;
1782 
1783   /// A Map of inloop reduction operations and their immediate chain operand.
1784   /// FIXME: This can be removed once reductions can be costed correctly in
1785   /// vplan. This was added to allow quick lookup to the inloop operations,
1786   /// without having to loop through InLoopReductionChains.
1787   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1788 
1789   /// Returns the expected difference in cost from scalarizing the expression
1790   /// feeding a predicated instruction \p PredInst. The instructions to
1791   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1792   /// non-negative return value implies the expression will be scalarized.
1793   /// Currently, only single-use chains are considered for scalarization.
1794   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1795                               ElementCount VF);
1796 
1797   /// Collect the instructions that are uniform after vectorization. An
1798   /// instruction is uniform if we represent it with a single scalar value in
1799   /// the vectorized loop corresponding to each vector iteration. Examples of
1800   /// uniform instructions include pointer operands of consecutive or
1801   /// interleaved memory accesses. Note that although uniformity implies an
1802   /// instruction will be scalar, the reverse is not true. In general, a
1803   /// scalarized instruction will be represented by VF scalar values in the
1804   /// vectorized loop, each corresponding to an iteration of the original
1805   /// scalar loop.
1806   void collectLoopUniforms(ElementCount VF);
1807 
1808   /// Collect the instructions that are scalar after vectorization. An
1809   /// instruction is scalar if it is known to be uniform or will be scalarized
1810   /// during vectorization. Non-uniform scalarized instructions will be
1811   /// represented by VF values in the vectorized loop, each corresponding to an
1812   /// iteration of the original scalar loop.
1813   void collectLoopScalars(ElementCount VF);
1814 
1815   /// Keeps cost model vectorization decision and cost for instructions.
1816   /// Right now it is used for memory instructions only.
1817   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1818                                 std::pair<InstWidening, InstructionCost>>;
1819 
1820   DecisionList WideningDecisions;
1821 
1822   /// Returns true if \p V is expected to be vectorized and it needs to be
1823   /// extracted.
1824   bool needsExtract(Value *V, ElementCount VF) const {
1825     Instruction *I = dyn_cast<Instruction>(V);
1826     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1827         TheLoop->isLoopInvariant(I))
1828       return false;
1829 
1830     // Assume we can vectorize V (and hence we need extraction) if the
1831     // scalars are not computed yet. This can happen, because it is called
1832     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1833     // the scalars are collected. That should be a safe assumption in most
1834     // cases, because we check if the operands have vectorizable types
1835     // beforehand in LoopVectorizationLegality.
1836     return Scalars.find(VF) == Scalars.end() ||
1837            !isScalarAfterVectorization(I, VF);
1838   };
1839 
1840   /// Returns a range containing only operands needing to be extracted.
1841   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1842                                                    ElementCount VF) const {
1843     return SmallVector<Value *, 4>(make_filter_range(
1844         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1845   }
1846 
1847   /// Determines if we have the infrastructure to vectorize loop \p L and its
1848   /// epilogue, assuming the main loop is vectorized by \p VF.
1849   bool isCandidateForEpilogueVectorization(const Loop &L,
1850                                            const ElementCount VF) const;
1851 
1852   /// Returns true if epilogue vectorization is considered profitable, and
1853   /// false otherwise.
1854   /// \p VF is the vectorization factor chosen for the original loop.
1855   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1856 
1857 public:
1858   /// The loop that we evaluate.
1859   Loop *TheLoop;
1860 
1861   /// Predicated scalar evolution analysis.
1862   PredicatedScalarEvolution &PSE;
1863 
1864   /// Loop Info analysis.
1865   LoopInfo *LI;
1866 
1867   /// Vectorization legality.
1868   LoopVectorizationLegality *Legal;
1869 
1870   /// Vector target information.
1871   const TargetTransformInfo &TTI;
1872 
1873   /// Target Library Info.
1874   const TargetLibraryInfo *TLI;
1875 
1876   /// Demanded bits analysis.
1877   DemandedBits *DB;
1878 
1879   /// Assumption cache.
1880   AssumptionCache *AC;
1881 
1882   /// Interface to emit optimization remarks.
1883   OptimizationRemarkEmitter *ORE;
1884 
1885   const Function *TheFunction;
1886 
1887   /// Loop Vectorize Hint.
1888   const LoopVectorizeHints *Hints;
1889 
1890   /// The interleave access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1892   InterleavedAccessInfo &InterleaveInfo;
1893 
1894   /// Values to ignore in the cost model.
1895   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1896 
1897   /// Values to ignore in the cost model when VF > 1.
1898   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1899 
1900   /// All element types found in the loop.
1901   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1902 
1903   /// Profitable vector factors.
1904   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1905 };
1906 } // end namespace llvm
1907 
1908 /// Helper struct to manage generating runtime checks for vectorization.
1909 ///
/// The runtime checks are created up-front in temporary blocks, un-linked from
/// the existing IR, so that their cost can be estimated accurately. After
/// deciding to vectorize, the checks are moved back. If deciding not to
/// vectorize, the temporary blocks are completely removed.
1914 class GeneratedRTChecks {
1915   /// Basic block which contains the generated SCEV checks, if any.
1916   BasicBlock *SCEVCheckBlock = nullptr;
1917 
1918   /// The value representing the result of the generated SCEV checks. If it is
1919   /// nullptr, either no SCEV checks have been generated or they have been used.
1920   Value *SCEVCheckCond = nullptr;
1921 
1922   /// Basic block which contains the generated memory runtime checks, if any.
1923   BasicBlock *MemCheckBlock = nullptr;
1924 
1925   /// The value representing the result of the generated memory runtime checks.
1926   /// If it is nullptr, either no memory runtime checks have been generated or
1927   /// they have been used.
1928   Value *MemRuntimeCheckCond = nullptr;
1929 
1930   DominatorTree *DT;
1931   LoopInfo *LI;
1932 
1933   SCEVExpander SCEVExp;
1934   SCEVExpander MemCheckExp;
1935 
1936 public:
1937   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1938                     const DataLayout &DL)
1939       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1940         MemCheckExp(SE, DL, "scev.check") {}
1941 
1942   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1943   /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If
1945   /// there is no vector code generation, the check blocks are removed
1946   /// completely.
1947   void Create(Loop *L, const LoopAccessInfo &LAI,
1948               const SCEVUnionPredicate &UnionPred) {
1949 
1950     BasicBlock *LoopHeader = L->getHeader();
1951     BasicBlock *Preheader = L->getLoopPreheader();
1952 
1953     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1954     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1955     // may be used by SCEVExpander. The blocks will be un-linked from their
1956     // predecessors and removed from LI & DT at the end of the function.
1957     if (!UnionPred.isAlwaysTrue()) {
1958       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1959                                   nullptr, "vector.scevcheck");
1960 
1961       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1962           &UnionPred, SCEVCheckBlock->getTerminator());
1963     }
1964 
1965     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1966     if (RtPtrChecking.Need) {
1967       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1968       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1969                                  "vector.memcheck");
1970 
1971       MemRuntimeCheckCond =
1972           addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1973                            RtPtrChecking.getChecks(), MemCheckExp);
1974       assert(MemRuntimeCheckCond &&
1975              "no RT checks generated although RtPtrChecking "
1976              "claimed checks are required");
1977     }
1978 
1979     if (!MemCheckBlock && !SCEVCheckBlock)
1980       return;
1981 
1982     // Unhook the temporary block with the checks, update various places
1983     // accordingly.
1984     if (SCEVCheckBlock)
1985       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1986     if (MemCheckBlock)
1987       MemCheckBlock->replaceAllUsesWith(Preheader);
1988 
1989     if (SCEVCheckBlock) {
1990       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1991       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1992       Preheader->getTerminator()->eraseFromParent();
1993     }
1994     if (MemCheckBlock) {
1995       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1996       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1997       Preheader->getTerminator()->eraseFromParent();
1998     }
1999 
2000     DT->changeImmediateDominator(LoopHeader, Preheader);
2001     if (MemCheckBlock) {
2002       DT->eraseNode(MemCheckBlock);
2003       LI->removeBlock(MemCheckBlock);
2004     }
2005     if (SCEVCheckBlock) {
2006       DT->eraseNode(SCEVCheckBlock);
2007       LI->removeBlock(SCEVCheckBlock);
2008     }
2009   }
2010 
2011   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2012   /// unused.
2013   ~GeneratedRTChecks() {
2014     SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
2015     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
2016     if (!SCEVCheckCond)
2017       SCEVCleaner.markResultUsed();
2018 
2019     if (!MemRuntimeCheckCond)
2020       MemCheckCleaner.markResultUsed();
2021 
2022     if (MemRuntimeCheckCond) {
2023       auto &SE = *MemCheckExp.getSE();
2024       // Memory runtime check generation creates compares that use expanded
2025       // values. Remove them before running the SCEVExpanderCleaners.
2026       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2027         if (MemCheckExp.isInsertedInstruction(&I))
2028           continue;
2029         SE.forgetValue(&I);
2030         SE.eraseValueFromMap(&I);
2031         I.eraseFromParent();
2032       }
2033     }
2034     MemCheckCleaner.cleanup();
2035     SCEVCleaner.cleanup();
2036 
2037     if (SCEVCheckCond)
2038       SCEVCheckBlock->eraseFromParent();
2039     if (MemRuntimeCheckCond)
2040       MemCheckBlock->eraseFromParent();
2041   }
2042 
2043   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2044   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2045   /// depending on the generated condition.
2046   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2047                              BasicBlock *LoopVectorPreHeader,
2048                              BasicBlock *LoopExitBlock) {
2049     if (!SCEVCheckCond)
2050       return nullptr;
2051     if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2052       if (C->isZero())
2053         return nullptr;
2054 
2055     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2056 
2057     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // If the vector preheader sits inside an outer loop, also add the SCEV
    // check block to that loop.
2059     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2060       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2061 
2062     SCEVCheckBlock->getTerminator()->eraseFromParent();
2063     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2064     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2065                                                 SCEVCheckBlock);
2066 
2067     DT->addNewBlock(SCEVCheckBlock, Pred);
2068     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2069 
2070     ReplaceInstWithInst(
2071         SCEVCheckBlock->getTerminator(),
2072         BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2073     // Mark the check as used, to prevent it from being removed during cleanup.
2074     SCEVCheckCond = nullptr;
2075     return SCEVCheckBlock;
2076   }
2077 
2078   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2079   /// the branches to branch to the vector preheader or \p Bypass, depending on
2080   /// the generated condition.
2081   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2082                                    BasicBlock *LoopVectorPreHeader) {
2083     // Check if we generated code that checks in runtime if arrays overlap.
2084     if (!MemRuntimeCheckCond)
2085       return nullptr;
2086 
2087     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2088     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2089                                                 MemCheckBlock);
2090 
2091     DT->addNewBlock(MemCheckBlock, Pred);
2092     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2093     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2094 
2095     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2096       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2097 
2098     ReplaceInstWithInst(
2099         MemCheckBlock->getTerminator(),
2100         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2101     MemCheckBlock->getTerminator()->setDebugLoc(
2102         Pred->getTerminator()->getDebugLoc());
2103 
2104     // Mark the check as used, to prevent it from being removed during cleanup.
2105     MemRuntimeCheckCond = nullptr;
2106     return MemCheckBlock;
2107   }
2108 };
2109 
2110 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2111 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the
2113 // vector length information is not provided, vectorization is not considered
2114 // explicit. Interleave hints are not allowed either. These limitations will be
2115 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
2117 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2118 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2119 // provides *explicit vectorization hints* (LV can bypass legal checks and
2120 // assume that vectorization is legal). However, both hints are implemented
2121 // using the same metadata (llvm.loop.vectorize, processed by
2122 // LoopVectorizeHints). This will be fixed in the future when the native IR
2123 // representation for pragma 'omp simd' is introduced.
2124 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2125                                    OptimizationRemarkEmitter *ORE) {
2126   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2127   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2128 
2129   // Only outer loops with an explicit vectorization hint are supported.
2130   // Unannotated outer loops are ignored.
2131   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2132     return false;
2133 
2134   Function *Fn = OuterLp->getHeader()->getParent();
2135   if (!Hints.allowVectorization(Fn, OuterLp,
2136                                 true /*VectorizeOnlyWhenForced*/)) {
2137     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2138     return false;
2139   }
2140 
2141   if (Hints.getInterleave() > 1) {
2142     // TODO: Interleave support is future work.
2143     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2144                          "outer loops.\n");
2145     Hints.emitRemarkWithHints();
2146     return false;
2147   }
2148 
2149   return true;
2150 }
2151 
2152 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2153                                   OptimizationRemarkEmitter *ORE,
2154                                   SmallVectorImpl<Loop *> &V) {
2155   // Collect inner loops and outer loops without irreducible control flow. For
2156   // now, only collect outer loops that have explicit vectorization hints. If we
2157   // are stress testing the VPlan H-CFG construction, we collect the outermost
2158   // loop of every loop nest.
2159   if (L.isInnermost() || VPlanBuildStressTest ||
2160       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2161     LoopBlocksRPO RPOT(&L);
2162     RPOT.perform(LI);
2163     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2164       V.push_back(&L);
2165       // TODO: Collect inner loops inside marked outer loops in case
2166       // vectorization fails for the outer loop. Do not invoke
2167       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2168       // already known to be reducible. We can use an inherited attribute for
2169       // that.
2170       return;
2171     }
2172   }
2173   for (Loop *InnerL : L)
2174     collectSupportedLoops(*InnerL, LI, ORE, V);
2175 }
2176 
2177 namespace {
2178 
2179 /// The LoopVectorize Pass.
2180 struct LoopVectorize : public FunctionPass {
2181   /// Pass identification, replacement for typeid
2182   static char ID;
2183 
2184   LoopVectorizePass Impl;
2185 
2186   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2187                          bool VectorizeOnlyWhenForced = false)
2188       : FunctionPass(ID),
2189         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2190     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2191   }
2192 
2193   bool runOnFunction(Function &F) override {
2194     if (skipFunction(F))
2195       return false;
2196 
2197     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2198     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2199     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2200     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2201     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2202     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2203     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2204     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2205     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2206     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2207     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2208     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2209     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2210 
2211     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2212         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2213 
2214     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2215                         GetLAA, *ORE, PSI).MadeAnyChange;
2216   }
2217 
2218   void getAnalysisUsage(AnalysisUsage &AU) const override {
2219     AU.addRequired<AssumptionCacheTracker>();
2220     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2221     AU.addRequired<DominatorTreeWrapperPass>();
2222     AU.addRequired<LoopInfoWrapperPass>();
2223     AU.addRequired<ScalarEvolutionWrapperPass>();
2224     AU.addRequired<TargetTransformInfoWrapperPass>();
2225     AU.addRequired<AAResultsWrapperPass>();
2226     AU.addRequired<LoopAccessLegacyAnalysis>();
2227     AU.addRequired<DemandedBitsWrapperPass>();
2228     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2229     AU.addRequired<InjectTLIMappingsLegacy>();
2230 
    // We currently do not preserve LoopInfo/dominator analyses with outer loop
2232     // vectorization. Until this is addressed, mark these analyses as preserved
2233     // only for non-VPlan-native path.
2234     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2235     if (!EnableVPlanNativePath) {
2236       AU.addPreserved<LoopInfoWrapperPass>();
2237       AU.addPreserved<DominatorTreeWrapperPass>();
2238     }
2239 
2240     AU.addPreserved<BasicAAWrapperPass>();
2241     AU.addPreserved<GlobalsAAWrapperPass>();
2242     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2243   }
2244 };
2245 
2246 } // end anonymous namespace
2247 
2248 //===----------------------------------------------------------------------===//
2249 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2250 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2251 //===----------------------------------------------------------------------===//
2252 
2253 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
2257   Instruction *Instr = dyn_cast<Instruction>(V);
2258   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2259                      (!Instr ||
2260                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2261   // Place the code for broadcasting invariant variables in the new preheader.
2262   IRBuilder<>::InsertPointGuard Guard(Builder);
2263   if (SafeToHoist)
2264     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2265 
2266   // Broadcast the scalar into all locations in the vector.
2267   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2268 
2269   return Shuf;
2270 }
2271 
2272 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2273     const InductionDescriptor &II, Value *Step, Value *Start,
2274     Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
2275     VPTransformState &State) {
2276   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2277          "Expected either an induction phi-node or a truncate of it!");
2278 
2279   // Construct the initial value of the vector IV in the vector loop preheader
2280   auto CurrIP = Builder.saveIP();
2281   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2282   if (isa<TruncInst>(EntryVal)) {
2283     assert(Start->getType()->isIntegerTy() &&
2284            "Truncation requires an integer type");
2285     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2286     Step = Builder.CreateTrunc(Step, TruncType);
2287     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2288   }
2289   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2290   Value *SteppedStart =
2291       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2292 
2293   // We create vector phi nodes for both integer and floating-point induction
2294   // variables. Here, we determine the kind of arithmetic we will perform.
2295   Instruction::BinaryOps AddOp;
2296   Instruction::BinaryOps MulOp;
2297   if (Step->getType()->isIntegerTy()) {
2298     AddOp = Instruction::Add;
2299     MulOp = Instruction::Mul;
2300   } else {
2301     AddOp = II.getInductionOpcode();
2302     MulOp = Instruction::FMul;
2303   }
2304 
2305   // Multiply the vectorization factor by the step using integer or
2306   // floating-point arithmetic as appropriate.
2307   Type *StepType = Step->getType();
2308   if (Step->getType()->isFloatingPointTy())
2309     StepType = IntegerType::get(StepType->getContext(),
2310                                 StepType->getScalarSizeInBits());
2311   Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
2312   if (Step->getType()->isFloatingPointTy())
2313     RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
2314   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2315 
2316   // Create a vector splat to use in the induction update.
2317   //
2318   // FIXME: If the step is non-constant, we create the vector splat with
2319   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2320   //        handle a constant vector splat.
2321   Value *SplatVF = isa<Constant>(Mul)
2322                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2323                        : Builder.CreateVectorSplat(VF, Mul);
2324   Builder.restoreIP(CurrIP);
2325 
2326   // We may need to add the step a number of times, depending on the unroll
2327   // factor. The last of those goes into the PHI.
2328   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2329                                     &*LoopVectorBody->getFirstInsertionPt());
2330   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2331   Instruction *LastInduction = VecInd;
2332   for (unsigned Part = 0; Part < UF; ++Part) {
2333     State.set(Def, LastInduction, Part);
2334 
2335     if (isa<TruncInst>(EntryVal))
2336       addMetadata(LastInduction, EntryVal);
2337     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
2338                                           State, Part);
2339 
2340     LastInduction = cast<Instruction>(
2341         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2342     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2343   }
2344 
2345   // Move the last step to the end of the latch block. This ensures consistent
2346   // placement of all induction updates.
2347   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2348   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2349   auto *ICmp = cast<Instruction>(Br->getCondition());
2350   LastInduction->moveBefore(ICmp);
2351   LastInduction->setName("vec.ind.next");
2352 
2353   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2354   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2355 }
2356 
2357 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2358   return Cost->isScalarAfterVectorization(I, VF) ||
2359          Cost->isProfitableToScalarize(I, VF);
2360 }
2361 
2362 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2363   if (shouldScalarizeInstruction(IV))
2364     return true;
2365   auto isScalarInst = [&](User *U) -> bool {
2366     auto *I = cast<Instruction>(U);
2367     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2368   };
2369   return llvm::any_of(IV->users(), isScalarInst);
2370 }
2371 
2372 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2373     const InductionDescriptor &ID, const Instruction *EntryVal,
2374     Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
2375     unsigned Part, unsigned Lane) {
2376   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2377          "Expected either an induction phi-node or a truncate of it!");
2378 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor as the original IV, but we don't
  // have to do any recording in this case - that is done when the original IV
  // is processed.
2385   if (isa<TruncInst>(EntryVal))
2386     return;
2387 
2388   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2389   if (Casts.empty())
2390     return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
  // induction update chain itself.
2394   if (Lane < UINT_MAX)
2395     State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
2396   else
2397     State.set(CastDef, VectorLoopVal, Part);
2398 }
2399 
2400 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
2401                                                 TruncInst *Trunc, VPValue *Def,
2402                                                 VPValue *CastDef,
2403                                                 VPTransformState &State) {
2404   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2405          "Primary induction variable must have an integer type");
2406 
2407   auto II = Legal->getInductionVars().find(IV);
2408   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2409 
2410   auto ID = II->second;
2411   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2412 
2413   // The value from the original loop to which we are mapping the new induction
2414   // variable.
2415   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2416 
2417   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2418 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2421   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2422     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2423            "Induction step should be loop invariant");
2424     if (PSE.getSE()->isSCEVable(IV->getType())) {
2425       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2426       return Exp.expandCodeFor(Step, Step->getType(),
2427                                LoopVectorPreHeader->getTerminator());
2428     }
2429     return cast<SCEVUnknown>(Step)->getValue();
2430   };
2431 
2432   // The scalar value to broadcast. This is derived from the canonical
2433   // induction variable. If a truncation type is given, truncate the canonical
2434   // induction variable and step. Otherwise, derive these values from the
2435   // induction descriptor.
2436   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2437     Value *ScalarIV = Induction;
2438     if (IV != OldInduction) {
2439       ScalarIV = IV->getType()->isIntegerTy()
2440                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2441                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2442                                           IV->getType());
2443       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2444       ScalarIV->setName("offset.idx");
2445     }
2446     if (Trunc) {
2447       auto *TruncType = cast<IntegerType>(Trunc->getType());
2448       assert(Step->getType()->isIntegerTy() &&
2449              "Truncation requires an integer step");
2450       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2451       Step = Builder.CreateTrunc(Step, TruncType);
2452     }
2453     return ScalarIV;
2454   };
2455 
  // Create the vector values from the scalar IV, for the case where we are not
  // creating a vector IV.
2458   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2459     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2460     for (unsigned Part = 0; Part < UF; ++Part) {
2461       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2462       Value *EntryPart =
2463           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2464                         ID.getInductionOpcode());
2465       State.set(Def, EntryPart, Part);
2466       if (Trunc)
2467         addMetadata(EntryPart, Trunc);
2468       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
2469                                             State, Part);
2470     }
2471   };
2472 
2473   // Fast-math-flags propagate from the original induction instruction.
2474   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2475   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2476     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2477 
2478   // Now do the actual transformations, and start with creating the step value.
2479   Value *Step = CreateStepValue(ID.getStep());
2480   if (VF.isZero() || VF.isScalar()) {
2481     Value *ScalarIV = CreateScalarIV(Step);
2482     CreateSplatIV(ScalarIV, Step);
2483     return;
2484   }
2485 
2486   // Determine if we want a scalar version of the induction variable. This is
2487   // true if the induction variable itself is not widened, or if it has at
2488   // least one user in the loop that is not widened.
2489   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2490   if (!NeedsScalarIV) {
2491     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2492                                     State);
2493     return;
2494   }
2495 
2496   // Try to create a new independent vector induction variable. If we can't
2497   // create the phi node, we will splat the scalar induction variable in each
2498   // loop iteration.
2499   if (!shouldScalarizeInstruction(EntryVal)) {
2500     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2501                                     State);
2502     Value *ScalarIV = CreateScalarIV(Step);
2503     // Create scalar steps that can be used by instructions we will later
2504     // scalarize. Note that the addition of the scalar steps will not increase
2505     // the number of instructions in the loop in the common case prior to
2506     // InstCombine. We will be trading one vector extract for each scalar step.
2507     buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2508     return;
2509   }
2510 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV, except when we tail-fold: then the splat IV feeds the
  // predicate used by the masked loads/stores.
2514   Value *ScalarIV = CreateScalarIV(Step);
2515   if (!Cost->isScalarEpilogueAllowed())
2516     CreateSplatIV(ScalarIV, Step);
2517   buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2518 }
2519 
2520 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2521                                           Instruction::BinaryOps BinOp) {
2522   // Create and check the types.
2523   auto *ValVTy = cast<VectorType>(Val->getType());
2524   ElementCount VLen = ValVTy->getElementCount();
2525 
2526   Type *STy = Val->getType()->getScalarType();
2527   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2528          "Induction Step must be an integer or FP");
2529   assert(Step->getType() == STy && "Step has wrong type");
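  // For example (integer case, a sketch assuming a fixed VF of 4): for
  // Val = <v, v, v, v>, StartIdx = 2 and Step = s, the result is
  //   <v + 2*s, v + 3*s, v + 4*s, v + 5*s>.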
2530 
2531   SmallVector<Constant *, 8> Indices;
2532 
2533   // Create a vector of consecutive numbers from zero to VF.
2534   VectorType *InitVecValVTy = ValVTy;
2535   Type *InitVecValSTy = STy;
2536   if (STy->isFloatingPointTy()) {
2537     InitVecValSTy =
2538         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2539     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2540   }
2541   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2542 
2543   // Add on StartIdx
2544   Value *StartIdxSplat = Builder.CreateVectorSplat(
2545       VLen, ConstantInt::get(InitVecValSTy, StartIdx));
2546   InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2547 
2548   if (STy->isIntegerTy()) {
2549     Step = Builder.CreateVectorSplat(VLen, Step);
2550     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    // flags, which can be found from the original scalar operations.
2553     Step = Builder.CreateMul(InitVec, Step);
2554     return Builder.CreateAdd(Val, Step, "induction");
2555   }
2556 
2557   // Floating point induction.
2558   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2559          "Binary Opcode should be specified for FP induction");
2560   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2561   Step = Builder.CreateVectorSplat(VLen, Step);
2562   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2563   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2564 }
2565 
2566 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2567                                            Instruction *EntryVal,
2568                                            const InductionDescriptor &ID,
2569                                            VPValue *Def, VPValue *CastDef,
2570                                            VPTransformState &State) {
2571   // We shouldn't have to build scalar steps if we aren't vectorizing.
2572   assert(VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2574   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2575   assert(ScalarIVTy == Step->getType() &&
2576          "Val and Step should have the same type");
2577 
2578   // We build scalar steps for both integer and floating-point induction
2579   // variables. Here, we determine the kind of arithmetic we will perform.
2580   Instruction::BinaryOps AddOp;
2581   Instruction::BinaryOps MulOp;
2582   if (ScalarIVTy->isIntegerTy()) {
2583     AddOp = Instruction::Add;
2584     MulOp = Instruction::Mul;
2585   } else {
2586     AddOp = ID.getInductionOpcode();
2587     MulOp = Instruction::FMul;
2588   }
2589 
2590   // Determine the number of scalars we need to generate for each unroll
2591   // iteration. If EntryVal is uniform, we only need to generate the first
2592   // lane. Otherwise, we generate all VF values.
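  // For example (a sketch, assuming a fixed VF of 4, UF = 2 and a non-uniform
  // EntryVal), we generate 4 scalars per part; lane L of part P receives
  //   ScalarIV + (4 * P + L) * Step.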
2593   bool IsUniform =
2594       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
2595   unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
2596   // Compute the scalar steps and save the results in State.
2597   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2598                                      ScalarIVTy->getScalarSizeInBits());
2599   Type *VecIVTy = nullptr;
2600   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2601   if (!IsUniform && VF.isScalable()) {
2602     VecIVTy = VectorType::get(ScalarIVTy, VF);
2603     UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
2604     SplatStep = Builder.CreateVectorSplat(VF, Step);
2605     SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
2606   }
2607 
2608   for (unsigned Part = 0; Part < UF; ++Part) {
2609     Value *StartIdx0 =
2610         createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2611 
2612     if (!IsUniform && VF.isScalable()) {
2613       auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
2614       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2615       if (ScalarIVTy->isFloatingPointTy())
2616         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2617       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2618       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2619       State.set(Def, Add, Part);
2620       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2621                                             Part);
2622       // It's useful to record the lane values too for the known minimum number
2623       // of elements so we do those below. This improves the code quality when
2624       // trying to extract the first element, for example.
2625     }
2626 
2627     if (ScalarIVTy->isFloatingPointTy())
2628       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2629 
2630     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2631       Value *StartIdx = Builder.CreateBinOp(
2632           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2633       // The step returned by `createStepForVF` is a runtime-evaluated value
2634       // when VF is scalable. Otherwise, it should be folded into a Constant.
2635       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2636              "Expected StartIdx to be folded to a constant when VF is not "
2637              "scalable");
2638       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2639       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2640       State.set(Def, Add, VPIteration(Part, Lane));
2641       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2642                                             Part, Lane);
2643     }
2644   }
2645 }
2646 
2647 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2648                                                     const VPIteration &Instance,
2649                                                     VPTransformState &State) {
2650   Value *ScalarInst = State.get(Def, Instance);
2651   Value *VectorValue = State.get(Def, Instance.Part);
2652   VectorValue = Builder.CreateInsertElement(
2653       VectorValue, ScalarInst,
2654       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2655   State.set(Def, VectorValue, Instance.Part);
2656 }
2657 
2658 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2659   assert(Vec->getType()->isVectorTy() && "Invalid type");
2660   return Builder.CreateVectorReverse(Vec, "reverse");
2661 }
2662 
2663 // Return whether we allow using masked interleave-groups (for dealing with
2664 // strided loads/stores that reside in predicated blocks, or for dealing
2665 // with gaps).
2666 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2667   // If an override option has been passed in for interleaved accesses, use it.
2668   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2669     return EnableMaskedInterleavedMemAccesses;
2670 
2671   return TTI.enableMaskedInterleavedAccessVectorization();
2672 }
2673 
2674 // Try to vectorize the interleave group that \p Instr belongs to.
2675 //
2676 // E.g. Translate following interleaved load group (factor = 3):
2677 //   for (i = 0; i < N; i+=3) {
2678 //     R = Pic[i];             // Member of index 0
2679 //     G = Pic[i+1];           // Member of index 1
2680 //     B = Pic[i+2];           // Member of index 2
2681 //     ... // do something to R, G, B
2682 //   }
2683 // To:
2684 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2685 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2686 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2687 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2688 //
2689 // Or translate following interleaved store group (factor = 3):
2690 //   for (i = 0; i < N; i+=3) {
2691 //     ... do something to R, G, B
2692 //     Pic[i]   = R;           // Member of index 0
2693 //     Pic[i+1] = G;           // Member of index 1
2694 //     Pic[i+2] = B;           // Member of index 2
2695 //   }
2696 // To:
2697 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2698 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2699 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2700 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2701 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2702 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2703     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2704     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2705     VPValue *BlockInMask) {
2706   Instruction *Instr = Group->getInsertPos();
2707   const DataLayout &DL = Instr->getModule()->getDataLayout();
2708 
2709   // Prepare for the vector type of the interleaved load/store.
2710   Type *ScalarTy = getLoadStoreType(Instr);
2711   unsigned InterleaveFactor = Group->getFactor();
2712   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2713   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2714 
2715   // Prepare for the new pointers.
2716   SmallVector<Value *, 2> AddrParts;
2717   unsigned Index = Group->getIndex(Instr);
2718 
2719   // TODO: extend the masked interleaved-group support to reversed access.
2720   assert((!BlockInMask || !Group->isReverse()) &&
2721          "Reversed masked interleave-group not supported.");
2722 
2723   // If the group is reverse, adjust the index to refer to the last vector lane
2724   // instead of the first. We adjust the index from the first vector lane,
2725   // rather than directly getting the pointer for lane VF - 1, because the
2726   // pointer operand of the interleaved access is supposed to be uniform. For
2727   // uniform instructions, we're only required to generate a value for the
2728   // first vector lane in each unroll iteration.
2729   if (Group->isReverse())
2730     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2731 
2732   for (unsigned Part = 0; Part < UF; Part++) {
2733     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2734     setDebugLocFromInst(AddrPart);
2735 
    // Notice that the current instruction could be at any member index. We
    // need to adjust the address to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2747 
2748     bool InBounds = false;
2749     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2750       InBounds = gep->isInBounds();
2751     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2752     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2753 
2754     // Cast to the vector pointer type.
2755     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2756     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2757     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2758   }
2759 
2760   setDebugLocFromInst(Instr);
2761   Value *PoisonVec = PoisonValue::get(VecTy);
2762 
2763   Value *MaskForGaps = nullptr;
2764   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2765     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2766     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2767   }
2768 
2769   // Vectorize the interleaved load group.
2770   if (isa<LoadInst>(Instr)) {
2771     // For each unroll part, create a wide load for the group.
2772     SmallVector<Value *, 2> NewLoads;
2773     for (unsigned Part = 0; Part < UF; Part++) {
2774       Instruction *NewLoad;
2775       if (BlockInMask || MaskForGaps) {
2776         assert(useMaskedInterleavedAccesses(*TTI) &&
2777                "masked interleaved groups are not allowed.");
2778         Value *GroupMask = MaskForGaps;
2779         if (BlockInMask) {
2780           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2781           Value *ShuffledMask = Builder.CreateShuffleVector(
2782               BlockInMaskPart,
2783               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2784               "interleaved.mask");
2785           GroupMask = MaskForGaps
2786                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2787                                                 MaskForGaps)
2788                           : ShuffledMask;
2789         }
2790         NewLoad =
2791             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2792                                      GroupMask, PoisonVec, "wide.masked.vec");
2793       }
2794       else
2795         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2796                                             Group->getAlign(), "wide.vec");
2797       Group->addMetadata(NewLoad);
2798       NewLoads.push_back(NewLoad);
2799     }
2800 
2801     // For each member in the group, shuffle out the appropriate data from the
2802     // wide loads.
2803     unsigned J = 0;
2804     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2805       Instruction *Member = Group->getMember(I);
2806 
2807       // Skip the gaps in the group.
2808       if (!Member)
2809         continue;
2810 
2811       auto StrideMask =
2812           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2813       for (unsigned Part = 0; Part < UF; Part++) {
2814         Value *StridedVec = Builder.CreateShuffleVector(
2815             NewLoads[Part], StrideMask, "strided.vec");
2816 
        // If this member has a different type, cast the result to that type.
2818         if (Member->getType() != ScalarTy) {
2819           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2820           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2821           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2822         }
2823 
2824         if (Group->isReverse())
2825           StridedVec = reverseVector(StridedVec);
2826 
2827         State.set(VPDefs[J], StridedVec, Part);
2828       }
2829       ++J;
2830     }
2831     return;
2832   }
2833 
  // The sub-vector type for the current instruction.
2835   auto *SubVT = VectorType::get(ScalarTy, VF);
2836 
2837   // Vectorize the interleaved store group.
2838   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2839   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2840          "masked interleaved groups are not allowed.");
2841   assert((!MaskForGaps || !VF.isScalable()) &&
2842          "masking gaps for scalable vectors is not yet supported.");
2843   for (unsigned Part = 0; Part < UF; Part++) {
2844     // Collect the stored vector from each member.
2845     SmallVector<Value *, 4> StoredVecs;
2846     for (unsigned i = 0; i < InterleaveFactor; i++) {
2847       assert((Group->getMember(i) || MaskForGaps) &&
2848              "Fail to get a member from an interleaved store group");
2849       Instruction *Member = Group->getMember(i);
2850 
2851       // Skip the gaps in the group.
2852       if (!Member) {
2853         Value *Undef = PoisonValue::get(SubVT);
2854         StoredVecs.push_back(Undef);
2855         continue;
2856       }
2857 
2858       Value *StoredVec = State.get(StoredValues[i], Part);
2859 
2860       if (Group->isReverse())
2861         StoredVec = reverseVector(StoredVec);
2862 
      // If this member has a different type, cast it to a unified type.
2865       if (StoredVec->getType() != SubVT)
2866         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2867 
2868       StoredVecs.push_back(StoredVec);
2869     }
2870 
2871     // Concatenate all vectors into a wide vector.
2872     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2873 
2874     // Interleave the elements in the wide vector.
2875     Value *IVec = Builder.CreateShuffleVector(
2876         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2877         "interleaved.vec");
2878 
2879     Instruction *NewStoreInstr;
2880     if (BlockInMask || MaskForGaps) {
2881       Value *GroupMask = MaskForGaps;
2882       if (BlockInMask) {
2883         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2884         Value *ShuffledMask = Builder.CreateShuffleVector(
2885             BlockInMaskPart,
2886             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2887             "interleaved.mask");
2888         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2889                                                       ShuffledMask, MaskForGaps)
2890                                 : ShuffledMask;
2891       }
2892       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2893                                                 Group->getAlign(), GroupMask);
2894     } else
2895       NewStoreInstr =
2896           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2897 
2898     Group->addMetadata(NewStoreInstr);
2899   }
2900 }
2901 
2902 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2903     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2904     VPValue *StoredValue, VPValue *BlockInMask, bool ConsecutiveStride,
2905     bool Reverse) {
2906   // Attempt to issue a wide load.
2907   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2908   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2909 
2910   assert((LI || SI) && "Invalid Load/Store instruction");
2911   assert((!SI || StoredValue) && "No stored value provided for widened store");
2912   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2913 
2914   Type *ScalarDataTy = getLoadStoreType(Instr);
2915 
2916   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2917   const Align Alignment = getLoadStoreAlignment(Instr);
2918   bool CreateGatherScatter = !ConsecutiveStride;
2919 
2920   VectorParts BlockInMaskParts(UF);
2921   bool isMaskRequired = BlockInMask;
2922   if (isMaskRequired)
2923     for (unsigned Part = 0; Part < UF; ++Part)
2924       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2925 
2926   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2927     // Calculate the pointer for the specific unroll-part.
2928     GetElementPtrInst *PartPtr = nullptr;
2929 
2930     bool InBounds = false;
2931     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2932       InBounds = gep->isInBounds();
2933     if (Reverse) {
2934       // If the address is consecutive but reversed, then the
2935       // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width vectors, VScale is 1, so RunTimeVF = VF.getKnownMinValue().
2938       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2939       // NumElt = -Part * RunTimeVF
2940       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
2941       // LastLane = 1 - RunTimeVF
2942       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
2943       PartPtr =
2944           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
2945       PartPtr->setIsInBounds(InBounds);
2946       PartPtr = cast<GetElementPtrInst>(
2947           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
2948       PartPtr->setIsInBounds(InBounds);
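      // For example (a sketch, assuming a fixed VF of 4): part 0 gets
      // NumElt = 0 and LastLane = -3, so the wide access covers Ptr[-3..0];
      // part 1 gets NumElt = -4 and LastLane = -3, covering Ptr[-7..-4].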
2949       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2950         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2951     } else {
2952       Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2953       PartPtr = cast<GetElementPtrInst>(
2954           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2955       PartPtr->setIsInBounds(InBounds);
2956     }
2957 
2958     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2959     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2960   };
2961 
2962   // Handle Stores:
2963   if (SI) {
2964     setDebugLocFromInst(SI);
2965 
2966     for (unsigned Part = 0; Part < UF; ++Part) {
2967       Instruction *NewSI = nullptr;
2968       Value *StoredVal = State.get(StoredValue, Part);
2969       if (CreateGatherScatter) {
2970         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2971         Value *VectorGep = State.get(Addr, Part);
2972         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2973                                             MaskPart);
2974       } else {
2975         if (Reverse) {
2976           // If we store to reverse consecutive memory locations, then we need
2977           // to reverse the order of elements in the stored value.
2978           StoredVal = reverseVector(StoredVal);
2979           // We don't want to update the value in the map as it might be used in
2980           // another expression. So don't call resetVectorValue(StoredVal).
2981         }
2982         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2983         if (isMaskRequired)
2984           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2985                                             BlockInMaskParts[Part]);
2986         else
2987           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2988       }
2989       addMetadata(NewSI, SI);
2990     }
2991     return;
2992   }
2993 
2994   // Handle loads.
2995   assert(LI && "Must have a load instruction");
2996   setDebugLocFromInst(LI);
2997   for (unsigned Part = 0; Part < UF; ++Part) {
2998     Value *NewLI;
2999     if (CreateGatherScatter) {
3000       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
3001       Value *VectorGep = State.get(Addr, Part);
3002       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
3003                                          nullptr, "wide.masked.gather");
3004       addMetadata(NewLI, LI);
3005     } else {
3006       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
3007       if (isMaskRequired)
3008         NewLI = Builder.CreateMaskedLoad(
3009             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
3010             PoisonValue::get(DataTy), "wide.masked.load");
3011       else
3012         NewLI =
3013             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
3014 
      // Add metadata to the load itself; the value recorded in State below is
      // the reverse shuffle when the access is reversed.
3016       addMetadata(NewLI, LI);
3017       if (Reverse)
3018         NewLI = reverseVector(NewLI);
3019     }
3020 
3021     State.set(Def, NewLI, Part);
3022   }
3023 }
3024 
3025 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
3026                                                VPUser &User,
3027                                                const VPIteration &Instance,
3028                                                bool IfPredicateInstr,
3029                                                VPTransformState &State) {
3030   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3031 
3032   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
3033   // the first lane and part.
3034   if (isa<NoAliasScopeDeclInst>(Instr))
3035     if (!Instance.isFirstIteration())
3036       return;
3037 
3038   setDebugLocFromInst(Instr);
3039 
  // Does this instruction return a value?
3041   bool IsVoidRetTy = Instr->getType()->isVoidTy();
3042 
3043   Instruction *Cloned = Instr->clone();
3044   if (!IsVoidRetTy)
3045     Cloned->setName(Instr->getName() + ".cloned");
3046 
3047   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
3048                                Builder.GetInsertPoint());
  // Replace the operands of the cloned instruction with their scalar
  // equivalents in the new loop.
3051   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
3052     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
3053     auto InputInstance = Instance;
3054     if (!Operand || !OrigLoop->contains(Operand) ||
3055         (Cost->isUniformAfterVectorization(Operand, State.VF)))
3056       InputInstance.Lane = VPLane::getFirstLane();
3057     auto *NewOp = State.get(User.getOperand(op), InputInstance);
3058     Cloned->setOperand(op, NewOp);
3059   }
3060   addNewMetadata(Cloned, Instr);
3061 
3062   // Place the cloned scalar in the new loop.
3063   Builder.Insert(Cloned);
3064 
3065   State.set(Def, Cloned, Instance);
3066 
  // If we just cloned a new assumption, add it to the assumption cache.
3068   if (auto *II = dyn_cast<AssumeInst>(Cloned))
3069     AC->registerAssumption(II);
3070 
3071   // End if-block.
3072   if (IfPredicateInstr)
3073     PredicatedInstructions.push_back(Cloned);
3074 }
3075 
3076 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3077                                                       Value *End, Value *Step,
3078                                                       Instruction *DL) {
3079   BasicBlock *Header = L->getHeader();
3080   BasicBlock *Latch = L->getLoopLatch();
3081   // As we're just creating this loop, it's possible no latch exists
3082   // yet. If so, use the header as this will be a single block loop.
3083   if (!Latch)
3084     Latch = Header;
3085 
3086   IRBuilder<> B(&*Header->getFirstInsertionPt());
3087   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3088   setDebugLocFromInst(OldInst, &B);
3089   auto *Induction = B.CreatePHI(Start->getType(), 2, "index");
3090 
3091   B.SetInsertPoint(Latch->getTerminator());
3092   setDebugLocFromInst(OldInst, &B);
3093 
3094   // Create i+1 and fill the PHINode.
3095   //
3096   // If the tail is not folded, we know that End - Start >= Step (either
3097   // statically or through the minimum iteration checks). We also know that both
3098   // Start % Step == 0 and End % Step == 0. We exit the vector loop if %IV +
3099   // %Step == %End. Hence we must exit the loop before %IV + %Step unsigned
3100   // overflows and we can mark the induction increment as NUW.
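  // For example (a sketch, assuming the tail is not folded), with Start = 0,
  // Step = 32 and End = 96, the IV takes the values 0, 32 and 64; the loop
  // exits once %index.next equals 96, so the increment never wraps.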
3101   Value *Next = B.CreateAdd(Induction, Step, "index.next",
3102                             /*NUW=*/!Cost->foldTailByMasking(), /*NSW=*/false);
3103   Induction->addIncoming(Start, L->getLoopPreheader());
3104   Induction->addIncoming(Next, Latch);
3105   // Create the compare.
3106   Value *ICmp = B.CreateICmpEQ(Next, End);
3107   B.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3108 
3109   // Now we have two terminators. Remove the old one from the block.
3110   Latch->getTerminator()->eraseFromParent();
3111 
3112   return Induction;
3113 }
3114 
3115 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3116   if (TripCount)
3117     return TripCount;
3118 
3119   assert(L && "Create Trip Count for null loop.");
3120   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3121   // Find the loop boundaries.
3122   ScalarEvolution *SE = PSE.getSE();
3123   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3124   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3125          "Invalid loop count");
3126 
3127   Type *IdxTy = Legal->getWidestInductionType();
3128   assert(IdxTy && "No type for induction");
3129 
3130   // The exit count might have the type of i64 while the phi is i32. This can
3131   // happen if we have an induction variable that is sign extended before the
3132   // compare. The only way that we get a backedge taken count is that the
3133   // induction variable was signed and as such will not overflow. In such a case
3134   // truncation is legal.
3135   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3136       IdxTy->getPrimitiveSizeInBits())
3137     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3138   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3139 
3140   // Get the total trip count from the count by adding 1.
3141   const SCEV *ExitCount = SE->getAddExpr(
3142       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3143 
3144   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3145 
3146   // Expand the trip count and place the new instructions in the preheader.
3147   // Notice that the pre-header does not change, only the loop body.
3148   SCEVExpander Exp(*SE, DL, "induction");
3149 
3150   // Count holds the overall loop count (N).
3151   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3152                                 L->getLoopPreheader()->getTerminator());
3153 
3154   if (TripCount->getType()->isPointerTy())
3155     TripCount =
3156         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3157                                     L->getLoopPreheader()->getTerminator());
3158 
3159   return TripCount;
3160 }
3161 
3162 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3163   if (VectorTripCount)
3164     return VectorTripCount;
3165 
3166   Value *TC = getOrCreateTripCount(L);
3167   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3168 
3169   Type *Ty = TC->getType();
3170   // This is where we can make the step a runtime constant.
3171   Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
3172 
3173   // If the tail is to be folded by masking, round the number of iterations N
3174   // up to a multiple of Step instead of rounding down. This is done by first
3175   // adding Step-1 and then rounding down. Note that it's ok if this addition
3176   // overflows: the vector induction variable will eventually wrap to zero given
3177   // that it starts at zero and its Step is a power of two; the loop will then
3178   // exit, with the last early-exit vector comparison also producing all-true.
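  // For example (a sketch, assuming N = 10 and Step = 8, i.e. VF = 8, UF = 1):
  // n.rnd.up is 17, so the vector trip count computed below becomes 16 and the
  // masked vector loop executes two iterations.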
3179   if (Cost->foldTailByMasking()) {
3180     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3181            "VF*UF must be a power of 2 when folding tail by masking");
3182     assert(!VF.isScalable() &&
3183            "Tail folding not yet supported for scalable vectors");
3184     TC = Builder.CreateAdd(
3185         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3186   }
3187 
3188   // Now we need to generate the expression for the part of the loop that the
3189   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3190   // iterations are not required for correctness, or N - Step, otherwise. Step
3191   // is equal to the vectorization factor (number of SIMD elements) times the
3192   // unroll factor (number of SIMD instructions).
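  // For example (a sketch, without tail folding): N = 20 with VF = 4 and
  // UF = 2 gives Step = 8, so n.mod.vf = 4 and n.vec = 16; the remaining 4
  // iterations run in the scalar loop.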
3193   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3194 
3195   // There are cases where we *must* run at least one iteration in the remainder
3196   // loop.  See the cost model for when this can happen.  If the step evenly
3197   // divides the trip count, we set the remainder to be equal to the step. If
3198   // the step does not evenly divide the trip count, no adjustment is necessary
3199   // since there will already be scalar iterations. Note that the minimum
3200   // iterations check ensures that N >= Step.
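  // For example (a sketch): with N = 16 and Step = 8, R would be 0; we then
  // force R = 8 so that n.vec = 8 and the scalar epilogue still executes the
  // remaining 8 iterations.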
3201   if (Cost->requiresScalarEpilogue(VF)) {
3202     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3203     R = Builder.CreateSelect(IsZero, Step, R);
3204   }
3205 
3206   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3207 
3208   return VectorTripCount;
3209 }
3210 
3211 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3212                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
3214   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3215   unsigned VF = DstFVTy->getNumElements();
3216   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3217   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3218   Type *SrcElemTy = SrcVecTy->getElementType();
3219   Type *DstElemTy = DstFVTy->getElementType();
3220   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3221          "Vector elements must have same size");
3222 
3223   // Do a direct cast if element types are castable.
3224   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3225     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3226   }
  // V cannot be directly cast to the desired vector type.
  // This may happen when V is a floating point vector but DstVTy is a vector
  // of pointers or vice-versa. Handle this with a two-step bitcast through an
  // intermediate integer type, i.e. Ptr <-> Int <-> Float.
3231   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3232          "Only one type should be a pointer type");
3233   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3234          "Only one type should be a floating point type");
3235   Type *IntTy =
3236       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3237   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3238   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3239   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3240 }
3241 
3242 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3243                                                          BasicBlock *Bypass) {
3244   Value *Count = getOrCreateTripCount(L);
3245   // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
3247   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3248   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3249 
3250   // Generate code to check if the loop's trip count is less than VF * UF, or
3251   // equal to it in case a scalar epilogue is required; this implies that the
3252   // vector trip count is zero. This check also covers the case where adding one
3253   // to the backedge-taken count overflowed leading to an incorrect trip count
3254   // of zero. In this case we will also jump to the scalar loop.
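  // For example (a sketch, assuming VF = 4, UF = 2 and no tail folding): Step
  // is 8, and with a required scalar epilogue we take the bypass to the scalar
  // loop when Count <= 8 (ULE); otherwise the check is Count < 8 (ULT).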
3255   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
3256                                             : ICmpInst::ICMP_ULT;
3257 
3258   // If tail is to be folded, vector loop takes care of all iterations.
3259   Value *CheckMinIters = Builder.getFalse();
3260   if (!Cost->foldTailByMasking()) {
3261     Value *Step =
3262         createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3263     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3264   }
3265   // Create new preheader for vector loop.
3266   LoopVectorPreHeader =
3267       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3268                  "vector.ph");
3269 
3270   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3271                                DT->getNode(Bypass)->getIDom()) &&
3272          "TC check is expected to dominate Bypass");
3273 
3274   // Update dominator for Bypass & LoopExit (if needed).
3275   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3276   if (!Cost->requiresScalarEpilogue(VF))
3277     // If there is an epilogue which must run, there's no edge from the
3278     // middle block to exit blocks  and thus no need to update the immediate
3279     // dominator of the exit blocks.
3280     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3281 
3282   ReplaceInstWithInst(
3283       TCCheckBlock->getTerminator(),
3284       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3285   LoopBypassBlocks.push_back(TCCheckBlock);
3286 }
3287 
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3290   BasicBlock *const SCEVCheckBlock =
3291       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3292   if (!SCEVCheckBlock)
3293     return nullptr;
3294 
3295   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3296            (OptForSizeBasedOnProfile &&
3297             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3298          "Cannot SCEV check stride or overflow when optimizing for size");
3299 
3300 
3301   // Update dominator only if this is first RT check.
3302   if (LoopBypassBlocks.empty()) {
3303     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3304     if (!Cost->requiresScalarEpilogue(VF))
3305       // If there is an epilogue which must run, there's no edge from the
3306       // middle block to exit blocks  and thus no need to update the immediate
3307       // dominator of the exit blocks.
3308       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3309   }
3310 
3311   LoopBypassBlocks.push_back(SCEVCheckBlock);
3312   AddedSafetyChecks = true;
3313   return SCEVCheckBlock;
3314 }
3315 
3316 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3317                                                       BasicBlock *Bypass) {
3318   // VPlan-native path does not do any analysis for runtime checks currently.
3319   if (EnableVPlanNativePath)
3320     return nullptr;
3321 
3322   BasicBlock *const MemCheckBlock =
3323       RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3324 
  // Check if we generated code that checks at runtime whether arrays overlap.
  // We put the checks into a separate block to make the more common case of
  // few elements faster.
3328   if (!MemCheckBlock)
3329     return nullptr;
3330 
3331   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3332     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3333            "Cannot emit memory checks when optimizing for size, unless forced "
3334            "to vectorize.");
3335     ORE->emit([&]() {
3336       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3337                                         L->getStartLoc(), L->getHeader())
3338              << "Code-size may be reduced by not forcing "
3339                 "vectorization, or by source-code modifications "
3340                 "eliminating the need for runtime checks "
3341                 "(e.g., adding 'restrict').";
3342     });
3343   }
3344 
3345   LoopBypassBlocks.push_back(MemCheckBlock);
3346 
3347   AddedSafetyChecks = true;
3348 
3349   // We currently don't use LoopVersioning for the actual loop cloning but we
3350   // still use it to add the noalias metadata.
3351   LVer = std::make_unique<LoopVersioning>(
3352       *Legal->getLAI(),
3353       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3354       DT, PSE.getSE());
3355   LVer->prepareNoAliasMetadata();
3356   return MemCheckBlock;
3357 }
3358 
3359 Value *InnerLoopVectorizer::emitTransformedIndex(
3360     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3361     const InductionDescriptor &ID) const {
3362 
3363   SCEVExpander Exp(*SE, DL, "induction");
3364   auto Step = ID.getStep();
3365   auto StartValue = ID.getStartValue();
3366   assert(Index->getType()->getScalarType() == Step->getType() &&
3367          "Index scalar type does not match StepValue type");
3368 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and rely
  // on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
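  //
  // In essence (a sketch of the cases handled below):
  //   IK_IntInduction: StartValue + Index * Step
  //   IK_PtrInduction: gep ElementType, StartValue, Index * Step
  //   IK_FpInduction:  StartValue fadd/fsub Index * Step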
3375   auto CreateAdd = [&B](Value *X, Value *Y) {
3376     assert(X->getType() == Y->getType() && "Types don't match!");
3377     if (auto *CX = dyn_cast<ConstantInt>(X))
3378       if (CX->isZero())
3379         return Y;
3380     if (auto *CY = dyn_cast<ConstantInt>(Y))
3381       if (CY->isZero())
3382         return X;
3383     return B.CreateAdd(X, Y);
3384   };
3385 
3386   // We allow X to be a vector type, in which case Y will potentially be
3387   // splatted into a vector with the same element count.
3388   auto CreateMul = [&B](Value *X, Value *Y) {
3389     assert(X->getType()->getScalarType() == Y->getType() &&
3390            "Types don't match!");
3391     if (auto *CX = dyn_cast<ConstantInt>(X))
3392       if (CX->isOne())
3393         return Y;
3394     if (auto *CY = dyn_cast<ConstantInt>(Y))
3395       if (CY->isOne())
3396         return X;
3397     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3398     if (XVTy && !isa<VectorType>(Y->getType()))
3399       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3400     return B.CreateMul(X, Y);
3401   };
3402 
3403   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3404   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3405   // the DomTree is not kept up-to-date for additional blocks generated in the
3406   // vector loop. By using the header as insertion point, we guarantee that the
3407   // expanded instructions dominate all their uses.
3408   auto GetInsertPoint = [this, &B]() {
3409     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3410     if (InsertBB != LoopVectorBody &&
3411         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3412       return LoopVectorBody->getTerminator();
3413     return &*B.GetInsertPoint();
3414   };
3415 
3416   switch (ID.getKind()) {
3417   case InductionDescriptor::IK_IntInduction: {
3418     assert(!isa<VectorType>(Index->getType()) &&
3419            "Vector indices not supported for integer inductions yet");
3420     assert(Index->getType() == StartValue->getType() &&
3421            "Index type does not match StartValue type");
3422     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3423       return B.CreateSub(StartValue, Index);
3424     auto *Offset = CreateMul(
3425         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3426     return CreateAdd(StartValue, Offset);
3427   }
3428   case InductionDescriptor::IK_PtrInduction: {
3429     assert(isa<SCEVConstant>(Step) &&
3430            "Expected constant step for pointer induction");
3431     return B.CreateGEP(
3432         ID.getElementType(), StartValue,
3433         CreateMul(Index,
3434                   Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3435                                     GetInsertPoint())));
3436   }
3437   case InductionDescriptor::IK_FpInduction: {
3438     assert(!isa<VectorType>(Index->getType()) &&
3439            "Vector indices not supported for FP inductions yet");
3440     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3441     auto InductionBinOp = ID.getInductionBinOp();
3442     assert(InductionBinOp &&
3443            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3444             InductionBinOp->getOpcode() == Instruction::FSub) &&
3445            "Original bin op should be defined for FP induction");
3446 
3447     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3448     Value *MulExp = B.CreateFMul(StepValue, Index);
3449     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3450                          "induction");
3451   }
3452   case InductionDescriptor::IK_NoInduction:
3453     return nullptr;
3454   }
3455   llvm_unreachable("invalid enum");
3456 }
3457 
3458 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3459   LoopScalarBody = OrigLoop->getHeader();
3460   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3461   assert(LoopVectorPreHeader && "Invalid loop structure");
3462   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3463   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3464          "multiple exit loop without required epilogue?");
3465 
3466   LoopMiddleBlock =
3467       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3468                  LI, nullptr, Twine(Prefix) + "middle.block");
3469   LoopScalarPreHeader =
3470       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3471                  nullptr, Twine(Prefix) + "scalar.ph");
3472 
3473   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3474 
3475   // Set up the middle block terminator.  Two cases:
3476   // 1) If we know that we must execute the scalar epilogue, emit an
3477   //    unconditional branch.
3478   // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case).  In this case, set up a conditional
3480   //    branch from the middle block to the loop scalar preheader, and the
3481   //    exit block.  completeLoopSkeleton will update the condition to use an
3482   //    iteration check, if required to decide whether to execute the remainder.
3483   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3484     BranchInst::Create(LoopScalarPreHeader) :
3485     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3486                        Builder.getTrue());
3487   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3488   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3489 
  // We intentionally don't let SplitBlock update LoopInfo, since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
3493   LoopVectorBody =
3494       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3495                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3496 
3497   // Update dominator for loop exit.
3498   if (!Cost->requiresScalarEpilogue(VF))
3499     // If there is an epilogue which must run, there's no edge from the
3500     // middle block to exit blocks  and thus no need to update the immediate
3501     // dominator of the exit blocks.
3502     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3503 
3504   // Create and register the new vector loop.
3505   Loop *Lp = LI->AllocateLoop();
3506   Loop *ParentLoop = OrigLoop->getParentLoop();
3507 
3508   // Insert the new loop into the loop nest and register the new basic blocks
3509   // before calling any utilities such as SCEV that require valid LoopInfo.
3510   if (ParentLoop) {
3511     ParentLoop->addChildLoop(Lp);
3512   } else {
3513     LI->addTopLevelLoop(Lp);
3514   }
3515   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3516   return Lp;
3517 }
3518 
3519 void InnerLoopVectorizer::createInductionResumeValues(
3520     Loop *L, Value *VectorTripCount,
3521     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3522   assert(VectorTripCount && L && "Expected valid arguments");
3523   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3524           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3525          "Inconsistent information about additional bypass.");
3526   // We are going to resume the execution of the scalar loop.
3527   // Go over all of the induction variables that we found and fix the
3528   // PHIs that are left in the scalar version of the loop.
3529   // The starting values of PHI nodes depend on the counter of the last
3530   // iteration in the vectorized loop.
3531   // If we come from a bypass edge then we need to start from the original
3532   // start value.
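  //
  // For example (a sketch): for a non-primary induction starting at 7 with
  // step 3 and a vector trip count of 16, EndValue becomes 7 + 16 * 3 = 55;
  // the bc.resume.val PHI takes 55 when entered from the middle block and the
  // original start value 7 from the bypass blocks.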
3533   for (auto &InductionEntry : Legal->getInductionVars()) {
3534     PHINode *OrigPhi = InductionEntry.first;
3535     InductionDescriptor II = InductionEntry.second;
3536 
    // Create phi nodes to merge from the backedge-taken check block.
3538     PHINode *BCResumeVal =
3539         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3540                         LoopScalarPreHeader->getTerminator());
3541     // Copy original phi DL over to the new one.
3542     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3543     Value *&EndValue = IVEndValues[OrigPhi];
3544     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3545     if (OrigPhi == OldInduction) {
3546       // We know what the end value is.
3547       EndValue = VectorTripCount;
3548     } else {
3549       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3550 
3551       // Fast-math-flags propagate from the original induction instruction.
3552       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3553         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3554 
3555       Type *StepType = II.getStep()->getType();
3556       Instruction::CastOps CastOp =
3557           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3558       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3559       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3560       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3561       EndValue->setName("ind.end");
3562 
3563       // Compute the end value for the additional bypass (if applicable).
3564       if (AdditionalBypass.first) {
3565         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3566         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3567                                          StepType, true);
3568         CRD =
3569             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3570         EndValueFromAdditionalBypass =
3571             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3572         EndValueFromAdditionalBypass->setName("ind.end");
3573       }
3574     }
3575     // The new PHI merges the original incoming value, in case of a bypass,
3576     // or the value at the end of the vectorized loop.
3577     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3578 
3579     // Fix the scalar body counter (PHI node).
3580     // The old induction's phi node in the scalar body needs the truncated
3581     // value.
3582     for (BasicBlock *BB : LoopBypassBlocks)
3583       BCResumeVal->addIncoming(II.getStartValue(), BB);
3584 
3585     if (AdditionalBypass.first)
3586       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3587                                             EndValueFromAdditionalBypass);
3588 
3589     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3590   }
3591 }
3592 
3593 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3594                                                       MDNode *OrigLoopID) {
3595   assert(L && "Expected valid loop.");
3596 
3597   // The trip counts should be cached by now.
3598   Value *Count = getOrCreateTripCount(L);
3599   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3600 
3601   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3602 
3603   // Add a check in the middle block to see if we have completed
3604   // all of the iterations in the first vector loop.  Three cases:
3605   // 1) If we require a scalar epilogue, there is no conditional branch as
3606   //    we unconditionally branch to the scalar preheader.  Do nothing.
3607   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3608   //    Thus if tail is to be folded, we know we don't need to run the
3609   //    remainder and we can use the previous value for the condition (true).
3610   // 3) Otherwise, construct a runtime check.
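  //
  // For case 3 the middle block ends with something like (sketch, names
  // illustrative):
  //   %cmp.n = icmp eq i64 %trip.count, %vector.trip.count
  //   br i1 %cmp.n, label %exit, label %scalar.ph
  // i.e. the remainder is skipped iff the vector loop covered all iterations.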
3611   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3612     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3613                                         Count, VectorTripCount, "cmp.n",
3614                                         LoopMiddleBlock->getTerminator());
3615 
3616     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3617     // of the corresponding compare because they may have ended up with
3618     // different line numbers and we want to avoid awkward line stepping while
3619     // debugging. Eg. if the compare has got a line number inside the loop.
3620     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3621     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3622   }
3623 
3624   // Get ready to start creating new instructions into the vectorized body.
3625   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3626          "Inconsistent vector loop preheader");
3627   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3628 
3629   Optional<MDNode *> VectorizedLoopID =
3630       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3631                                       LLVMLoopVectorizeFollowupVectorized});
3632   if (VectorizedLoopID.hasValue()) {
3633     L->setLoopID(VectorizedLoopID.getValue());
3634 
3635     // Do not setAlreadyVectorized if loop attributes have been defined
3636     // explicitly.
3637     return LoopVectorPreHeader;
3638   }
3639 
3640   // Keep all loop hints from the original loop on the vector loop (we'll
3641   // replace the vectorizer-specific hints below).
3642   if (MDNode *LID = OrigLoop->getLoopID())
3643     L->setLoopID(LID);
3644 
3645   LoopVectorizeHints Hints(L, true, *ORE);
3646   Hints.setAlreadyVectorized();
3647 
3648 #ifdef EXPENSIVE_CHECKS
3649   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3650   LI->verify(*DT);
3651 #endif
3652 
3653   return LoopVectorPreHeader;
3654 }
3655 
3656 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3657   /*
3658    In this function we generate a new loop. The new loop will contain
3659    the vectorized instructions while the old loop will continue to run the
3660    scalar remainder.
3661 
3662        [ ] <-- loop iteration number check.
3663     /   |
3664    /    v
3665   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3666   |  /  |
3667   | /   v
3668   ||   [ ]     <-- vector pre header.
3669   |/    |
3670   |     v
3671   |    [  ] \
3672   |    [  ]_|   <-- vector loop.
3673   |     |
3674   |     v
3675   \   -[ ]   <--- middle-block.
3676    \/   |
3677    /\   v
3678    | ->[ ]     <--- new preheader.
3679    |    |
3680  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3681    |   [ ] \
3682    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3683     \   |
3684      \  v
3685       >[ ]     <-- exit block(s).
3686    ...
3687    */
3688 
3689   // Get the metadata of the original loop before it gets modified.
3690   MDNode *OrigLoopID = OrigLoop->getLoopID();
3691 
3692   // Workaround!  Compute the trip count of the original loop and cache it
3693   // before we start modifying the CFG.  This code has a systemic problem
3694   // wherein it tries to run analysis over partially constructed IR; this is
3695   // wrong, and not simply for SCEV.  The trip count of the original loop
3696   // simply happens to be prone to hitting this in practice.  In theory, we
3697   // can hit the same issue for any SCEV, or ValueTracking query done during
3698   // mutation.  See PR49900.
3699   getOrCreateTripCount(OrigLoop);
3700 
3701   // Create an empty vector loop, and prepare basic blocks for the runtime
3702   // checks.
3703   Loop *Lp = createVectorLoopSkeleton("");
3704 
3705   // Now, compare the new count to zero. If it is zero skip the vector loop and
3706   // jump to the scalar loop. This check also covers the case where the
3707   // backedge-taken count is uint##_max: adding one to it will overflow leading
3708   // to an incorrect trip count of zero. In this (rare) case we will also jump
3709   // to the scalar loop.
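  // For example, with an 8-bit trip count a backedge-taken count of 255 gives
  // 255 + 1 == 0 after wrapping, i.e. a bogus trip count of zero, so the
  // scalar loop must be taken in that case as well.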
3710   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3711 
3712   // Generate the code to check any assumptions that we've made for SCEV
3713   // expressions.
3714   emitSCEVChecks(Lp, LoopScalarPreHeader);
3715 
3716   // Generate the code that checks in runtime if arrays overlap. We put the
3717   // checks into a separate block to make the more common case of few elements
3718   // faster.
3719   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3720 
3721   // Some loops have a single integer induction variable, while other loops
3722   // don't. One example is C++ iterators, which often have multiple pointer
3723   // induction variables. In the code below we also support a case where we
3724   // don't have a single induction variable.
3725   //
3726   // We try to obtain an induction variable from the original loop as hard
3727   // as possible. However if we don't find one that:
3728   //   - is an integer
3729   //   - counts from zero, stepping by one
3730   //   - is the size of the widest induction variable type
3731   // then we create a new one.
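  //
  // The canonical IV created below starts at zero and steps by VF * UF per
  // vector iteration, e.g. for VF = 4 and UF = 2 (sketch):
  //   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  //   %index.next = add i64 %index, 8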
3732   OldInduction = Legal->getPrimaryInduction();
3733   Type *IdxTy = Legal->getWidestInductionType();
3734   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3735   // The loop step is equal to the vectorization factor (num of SIMD elements)
3736   // times the unroll factor (num of SIMD instructions).
3737   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3738   Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3739   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3740   Induction =
3741       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3742                               getDebugLocFromInstOrOperands(OldInduction));
3743 
3744   // Emit phis for the new starting index of the scalar loop.
3745   createInductionResumeValues(Lp, CountRoundDown);
3746 
3747   return completeLoopSkeleton(Lp, OrigLoopID);
3748 }
3749 
3750 // Fix up external users of the induction variable. At this point, we are
3751 // in LCSSA form, with all external PHIs that use the IV having one input value,
3752 // coming from the remainder loop. We need those PHIs to also have a correct
3753 // value for the IV when arriving directly from the middle block.
3754 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3755                                        const InductionDescriptor &II,
3756                                        Value *CountRoundDown, Value *EndValue,
3757                                        BasicBlock *MiddleBlock) {
3758   // There are two kinds of external IV uses - those that use the value
3759   // computed in the last iteration (the PHI) and those that use the penultimate
3760   // value (the value that feeds into the phi from the loop latch).
3761   // We allow both, but they obviously have different values.
3762 
3763   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3764 
3765   DenseMap<Value *, Value *> MissingVals;
3766 
3767   // An external user of the last iteration's value should see the value that
3768   // the remainder loop uses to initialize its own IV.
3769   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3770   for (User *U : PostInc->users()) {
3771     Instruction *UI = cast<Instruction>(U);
3772     if (!OrigLoop->contains(UI)) {
3773       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3774       MissingVals[UI] = EndValue;
3775     }
3776   }
3777 
3778   // An external user of the penultimate value needs to see EndValue - Step.
3779   // The simplest way to get this is to recompute it from the constituent SCEVs,
3780   // that is Start + (Step * (CRD - 1)).
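  // For example, with Start = 0 and Step = 1 the escaping value for users of
  // the phi is simply CRD - 1, while users of the post-increment value see
  // CRD itself.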
3781   for (User *U : OrigPhi->users()) {
3782     auto *UI = cast<Instruction>(U);
3783     if (!OrigLoop->contains(UI)) {
3784       const DataLayout &DL =
3785           OrigLoop->getHeader()->getModule()->getDataLayout();
3786       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3787 
3788       IRBuilder<> B(MiddleBlock->getTerminator());
3789 
3790       // Fast-math-flags propagate from the original induction instruction.
3791       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3792         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3793 
3794       Value *CountMinusOne = B.CreateSub(
3795           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3796       Value *CMO =
3797           !II.getStep()->getType()->isIntegerTy()
3798               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3799                              II.getStep()->getType())
3800               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3801       CMO->setName("cast.cmo");
3802       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3803       Escape->setName("ind.escape");
3804       MissingVals[UI] = Escape;
3805     }
3806   }
3807 
3808   for (auto &I : MissingVals) {
3809     PHINode *PHI = cast<PHINode>(I.first);
3810     // One corner case we have to handle is two IVs "chasing" each-other,
3811     // that is %IV2 = phi [...], [ %IV1, %latch ]
3812     // In this case, if IV1 has an external use, we need to avoid adding both
3813     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3814     // don't already have an incoming value for the middle block.
3815     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3816       PHI->addIncoming(I.second, MiddleBlock);
3817   }
3818 }
3819 
3820 namespace {
3821 
3822 struct CSEDenseMapInfo {
3823   static bool canHandle(const Instruction *I) {
3824     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3825            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3826   }
3827 
3828   static inline Instruction *getEmptyKey() {
3829     return DenseMapInfo<Instruction *>::getEmptyKey();
3830   }
3831 
3832   static inline Instruction *getTombstoneKey() {
3833     return DenseMapInfo<Instruction *>::getTombstoneKey();
3834   }
3835 
3836   static unsigned getHashValue(const Instruction *I) {
3837     assert(canHandle(I) && "Unknown instruction!");
3838     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3839                                                            I->value_op_end()));
3840   }
3841 
3842   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3843     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3844         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3845       return LHS == RHS;
3846     return LHS->isIdenticalTo(RHS);
3847   }
3848 };
3849 
3850 } // end anonymous namespace
3851 
3852 /// Perform CSE of induction variable instructions.
3853 static void cse(BasicBlock *BB) {
3854   // Perform simple cse.
3855   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3856   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3857     if (!CSEDenseMapInfo::canHandle(&In))
3858       continue;
3859 
3860     // Check if we can replace this instruction with any of the
3861     // visited instructions.
3862     if (Instruction *V = CSEMap.lookup(&In)) {
3863       In.replaceAllUsesWith(V);
3864       In.eraseFromParent();
3865       continue;
3866     }
3867 
3868     CSEMap[&In] = &In;
3869   }
3870 }
3871 
3872 InstructionCost
3873 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3874                                               bool &NeedToScalarize) const {
3875   Function *F = CI->getCalledFunction();
3876   Type *ScalarRetTy = CI->getType();
3877   SmallVector<Type *, 4> Tys, ScalarTys;
3878   for (auto &ArgOp : CI->args())
3879     ScalarTys.push_back(ArgOp->getType());
3880 
3881   // Estimate cost of scalarized vector call. The source operands are assumed
3882   // to be vectors, so we need to extract individual elements from them,
3883   // execute VF scalar calls, and then gather the result into the vector return
3884   // value.
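  //
  // For example (illustrative numbers only), with VF = 4, a scalar call cost
  // of 10 and a scalarization overhead of 6, the estimate computed below is
  // 4 * 10 + 6 = 46; a cheaper vector library call, if available, is
  // preferred further down.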
3885   InstructionCost ScalarCallCost =
3886       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3887   if (VF.isScalar())
3888     return ScalarCallCost;
3889 
3890   // Compute corresponding vector type for return value and arguments.
3891   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3892   for (Type *ScalarTy : ScalarTys)
3893     Tys.push_back(ToVectorTy(ScalarTy, VF));
3894 
3895   // Compute costs of unpacking argument values for the scalar calls and
3896   // packing the return values to a vector.
3897   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3898 
3899   InstructionCost Cost =
3900       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3901 
3902   // If we can't emit a vector call for this function, then the currently found
3903   // cost is the cost we need to return.
3904   NeedToScalarize = true;
3905   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3906   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3907 
3908   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3909     return Cost;
3910 
3911   // If the corresponding vector cost is cheaper, return its cost.
3912   InstructionCost VectorCallCost =
3913       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3914   if (VectorCallCost < Cost) {
3915     NeedToScalarize = false;
3916     Cost = VectorCallCost;
3917   }
3918   return Cost;
3919 }
3920 
3921 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3922   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3923     return Elt;
3924   return VectorType::get(Elt, VF);
3925 }
3926 
3927 InstructionCost
3928 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3929                                                    ElementCount VF) const {
3930   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3931   assert(ID && "Expected intrinsic call!");
3932   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3933   FastMathFlags FMF;
3934   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3935     FMF = FPMO->getFastMathFlags();
3936 
3937   SmallVector<const Value *> Arguments(CI->args());
3938   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3939   SmallVector<Type *> ParamTys;
3940   std::transform(FTy->param_begin(), FTy->param_end(),
3941                  std::back_inserter(ParamTys),
3942                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3943 
3944   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3945                                     dyn_cast<IntrinsicInst>(CI));
3946   return TTI.getIntrinsicInstrCost(CostAttrs,
3947                                    TargetTransformInfo::TCK_RecipThroughput);
3948 }
3949 
3950 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3951   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3952   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3953   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3954 }
3955 
3956 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3957   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3958   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3959   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3960 }
3961 
3962 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3963   // For every instruction `I` in MinBWs, truncate the operands, create a
3964   // truncated version of `I` and reextend its result. InstCombine runs
3965   // later and will remove any ext/trunc pairs.
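  //
  // For example (sketch), if the result of a 32-bit add only needs 8 bits:
  //   %add = add <4 x i32> %a, %b
  // becomes
  //   %a.tr   = trunc <4 x i32> %a to <4 x i8>
  //   %b.tr   = trunc <4 x i32> %b to <4 x i8>
  //   %add.tr = add <4 x i8> %a.tr, %b.tr
  //   %add.ze = zext <4 x i8> %add.tr to <4 x i32>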
3966   SmallPtrSet<Value *, 4> Erased;
3967   for (const auto &KV : Cost->getMinimalBitwidths()) {
3968     // If the value wasn't vectorized, we must maintain the original scalar
3969     // type. The absence of the value from State indicates that it
3970     // wasn't vectorized.
3971     // FIXME: Should not rely on getVPValue at this point.
3972     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3973     if (!State.hasAnyVectorValue(Def))
3974       continue;
3975     for (unsigned Part = 0; Part < UF; ++Part) {
3976       Value *I = State.get(Def, Part);
3977       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3978         continue;
3979       Type *OriginalTy = I->getType();
3980       Type *ScalarTruncatedTy =
3981           IntegerType::get(OriginalTy->getContext(), KV.second);
3982       auto *TruncatedTy = VectorType::get(
3983           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3984       if (TruncatedTy == OriginalTy)
3985         continue;
3986 
3987       IRBuilder<> B(cast<Instruction>(I));
3988       auto ShrinkOperand = [&](Value *V) -> Value * {
3989         if (auto *ZI = dyn_cast<ZExtInst>(V))
3990           if (ZI->getSrcTy() == TruncatedTy)
3991             return ZI->getOperand(0);
3992         return B.CreateZExtOrTrunc(V, TruncatedTy);
3993       };
3994 
3995       // The actual instruction modification depends on the instruction type,
3996       // unfortunately.
3997       Value *NewI = nullptr;
3998       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3999         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
4000                              ShrinkOperand(BO->getOperand(1)));
4001 
4002         // Any wrapping introduced by shrinking this operation shouldn't be
4003         // considered undefined behavior. So, we can't unconditionally copy
4004         // arithmetic wrapping flags to NewI.
4005         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
4006       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
4007         NewI =
4008             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
4009                          ShrinkOperand(CI->getOperand(1)));
4010       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
4011         NewI = B.CreateSelect(SI->getCondition(),
4012                               ShrinkOperand(SI->getTrueValue()),
4013                               ShrinkOperand(SI->getFalseValue()));
4014       } else if (auto *CI = dyn_cast<CastInst>(I)) {
4015         switch (CI->getOpcode()) {
4016         default:
4017           llvm_unreachable("Unhandled cast!");
4018         case Instruction::Trunc:
4019           NewI = ShrinkOperand(CI->getOperand(0));
4020           break;
4021         case Instruction::SExt:
4022           NewI = B.CreateSExtOrTrunc(
4023               CI->getOperand(0),
4024               smallestIntegerVectorType(OriginalTy, TruncatedTy));
4025           break;
4026         case Instruction::ZExt:
4027           NewI = B.CreateZExtOrTrunc(
4028               CI->getOperand(0),
4029               smallestIntegerVectorType(OriginalTy, TruncatedTy));
4030           break;
4031         }
4032       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
4033         auto Elements0 =
4034             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
4035         auto *O0 = B.CreateZExtOrTrunc(
4036             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
4037         auto Elements1 =
4038             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
4039         auto *O1 = B.CreateZExtOrTrunc(
4040             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
4041 
4042         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
4043       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
4044         // Don't do anything with the operands, just extend the result.
4045         continue;
4046       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
4047         auto Elements =
4048             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
4049         auto *O0 = B.CreateZExtOrTrunc(
4050             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
4051         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
4052         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
4053       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
4054         auto Elements =
4055             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
4056         auto *O0 = B.CreateZExtOrTrunc(
4057             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
4058         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
4059       } else {
4060         // If we don't know what to do, be conservative and don't do anything.
4061         continue;
4062       }
4063 
4064       // Lastly, extend the result.
4065       NewI->takeName(cast<Instruction>(I));
4066       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
4067       I->replaceAllUsesWith(Res);
4068       cast<Instruction>(I)->eraseFromParent();
4069       Erased.insert(I);
4070       State.reset(Def, Res, Part);
4071     }
4072   }
4073 
4074   // We'll have created a bunch of ZExts that are now parentless. Clean up.
4075   for (const auto &KV : Cost->getMinimalBitwidths()) {
4076     // If the value wasn't vectorized, we must maintain the original scalar
4077     // type. The absence of the value from State indicates that it
4078     // wasn't vectorized.
4079     // FIXME: Should not rely on getVPValue at this point.
4080     VPValue *Def = State.Plan->getVPValue(KV.first, true);
4081     if (!State.hasAnyVectorValue(Def))
4082       continue;
4083     for (unsigned Part = 0; Part < UF; ++Part) {
4084       Value *I = State.get(Def, Part);
4085       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
4086       if (Inst && Inst->use_empty()) {
4087         Value *NewI = Inst->getOperand(0);
4088         Inst->eraseFromParent();
4089         State.reset(Def, NewI, Part);
4090       }
4091     }
4092   }
4093 }
4094 
4095 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
4096   // Insert truncates and extends for any truncated instructions as hints to
4097   // InstCombine.
4098   if (VF.isVector())
4099     truncateToMinimalBitwidths(State);
4100 
4101   // Fix widened non-induction PHIs by setting up the PHI operands.
4102   if (OrigPHIsToFix.size()) {
4103     assert(EnableVPlanNativePath &&
4104            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
4105     fixNonInductionPHIs(State);
4106   }
4107 
4108   // At this point every instruction in the original loop is widened to a
4109   // vector form. Now we need to fix the recurrences in the loop. These PHI
4110   // nodes are currently empty because we did not want to introduce cycles.
4111   // This is the second stage of vectorizing recurrences.
4112   fixCrossIterationPHIs(State);
4113 
4114   // Forget the original basic block.
4115   PSE.getSE()->forgetLoop(OrigLoop);
4116 
4117   // If we inserted an edge from the middle block to the unique exit block,
4118   // update uses outside the loop (phis) to account for the newly inserted
4119   // edge.
4120   if (!Cost->requiresScalarEpilogue(VF)) {
4121     // Fix-up external users of the induction variables.
4122     for (auto &Entry : Legal->getInductionVars())
4123       fixupIVUsers(Entry.first, Entry.second,
4124                    getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4125                    IVEndValues[Entry.first], LoopMiddleBlock);
4126 
4127     fixLCSSAPHIs(State);
4128   }
4129 
4130   for (Instruction *PI : PredicatedInstructions)
4131     sinkScalarOperands(&*PI);
4132 
4133   // Remove redundant induction instructions.
4134   cse(LoopVectorBody);
4135 
4136   // Set/update profile weights for the vector and remainder loops as original
4137   // loop iterations are now distributed among them. Note that original loop
4138   // represented by LoopScalarBody becomes remainder loop after vectorization.
4139   //
4140   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
4141   // end up with a slightly less accurate result, but that should be OK since
4142   // the profile is not inherently precise anyway. Note also that a possible
4143   // bypass of the vector code caused by legality checks is ignored, assigning
4144   // all the weight to the vector loop, optimistically.
4145   //
4146   // For scalable vectorization we can't know at compile time how many
4147   // iterations of the loop are handled in one vector iteration, so instead
4148   // assume a pessimistic vscale of '1'.
4149   setProfileInfoAfterUnrolling(
4150       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4151       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4152 }
4153 
4154 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4155   // In order to support recurrences we need to be able to vectorize Phi nodes.
4156   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4157   // stage #2: We now need to fix the recurrences by adding incoming edges to
4158   // the currently empty PHI nodes. At this point every instruction in the
4159   // original loop is widened to a vector form so we can use them to construct
4160   // the incoming edges.
4161   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4162   for (VPRecipeBase &R : Header->phis()) {
4163     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
4164       fixReduction(ReductionPhi, State);
4165     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
4166       fixFirstOrderRecurrence(FOR, State);
4167   }
4168 }
4169 
4170 void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4171                                                   VPTransformState &State) {
4172   // This is the second phase of vectorizing first-order recurrences. An
4173   // overview of the transformation is described below. Suppose we have the
4174   // following loop.
4175   //
4176   //   for (int i = 0; i < n; ++i)
4177   //     b[i] = a[i] - a[i - 1];
4178   //
4179   // There is a first-order recurrence on "a". For this loop, the shorthand
4180   // scalar IR looks like:
4181   //
4182   //   scalar.ph:
4183   //     s_init = a[-1]
4184   //     br scalar.body
4185   //
4186   //   scalar.body:
4187   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4188   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4189   //     s2 = a[i]
4190   //     b[i] = s2 - s1
4191   //     br cond, scalar.body, ...
4192   //
4193   // In this example, s1 is a recurrence because its value depends on the
4194   // previous iteration. In the first phase of vectorization, we created a
4195   // vector phi v1 for s1. We now complete the vectorization and produce the
4196   // shorthand vector IR shown below (for VF = 4, UF = 1).
4197   //
4198   //   vector.ph:
4199   //     v_init = vector(..., ..., ..., a[-1])
4200   //     br vector.body
4201   //
4202   //   vector.body
4203   //     i = phi [0, vector.ph], [i+4, vector.body]
4204   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
4205   //     v2 = a[i, i+1, i+2, i+3];
4206   //     v3 = vector(v1(3), v2(0, 1, 2))
4207   //     b[i, i+1, i+2, i+3] = v2 - v3
4208   //     br cond, vector.body, middle.block
4209   //
4210   //   middle.block:
4211   //     x = v2(3)
4212   //     br scalar.ph
4213   //
4214   //   scalar.ph:
4215   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4216   //     br scalar.body
4217   //
4218   // After the vector loop completes execution, we extract the next value of
4219   // the recurrence (x) to use as the initial value in the scalar loop.
4220 
4221   // Extract the last vector element in the middle block. This will be the
4222   // initial value for the recurrence when jumping to the scalar loop.
4223   VPValue *PreviousDef = PhiR->getBackedgeValue();
4224   Value *Incoming = State.get(PreviousDef, UF - 1);
4225   auto *ExtractForScalar = Incoming;
4226   auto *IdxTy = Builder.getInt32Ty();
4227   if (VF.isVector()) {
4228     auto *One = ConstantInt::get(IdxTy, 1);
4229     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4230     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4231     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4232     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
4233                                                     "vector.recur.extract");
4234   }
4235   // Extract the second last element in the middle block if the
4236   // Phi is used outside the loop. We need to extract the phi itself
4237   // and not the last element (the phi update in the current iteration). This
4238   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4239   // when the scalar loop is not run at all.
4240   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4241   if (VF.isVector()) {
4242     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4243     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4244     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4245         Incoming, Idx, "vector.recur.extract.for.phi");
4246   } else if (UF > 1)
4247     // When the loop is unrolled without vectorizing, initialize
4248     // ExtractForPhiUsedOutsideLoop with the unrolled part just prior to the
4249     // final unrolled part of `Incoming`. This is analogous to the vectorized
4250     // case above: extracting the second-to-last element when VF > 1.
4251     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4252 
4253   // Fix the initial value of the original recurrence in the scalar loop.
4254   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4255   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4256   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4257   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4258   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4259     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4260     Start->addIncoming(Incoming, BB);
4261   }
4262 
4263   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4264   Phi->setName("scalar.recur");
4265 
4266   // Finally, fix users of the recurrence outside the loop. The users will need
4267   // either the last value of the scalar recurrence or the last value of the
4268   // vector recurrence we extracted in the middle block. Since the loop is in
4269   // LCSSA form, we just need to find all the phi nodes for the original scalar
4270   // recurrence in the exit block, and then add an edge for the middle block.
4271   // Note that LCSSA does not imply single entry when the original scalar loop
4272   // had multiple exiting edges (as we always run the last iteration in the
4273   // scalar epilogue); in that case, there is no edge from the middle block to
4274   // the exit block, and thus no phis need to be updated.
4275   if (!Cost->requiresScalarEpilogue(VF))
4276     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4277       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4278         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4279 }
4280 
4281 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4282                                        VPTransformState &State) {
4283   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4284   // Get its reduction variable descriptor.
4285   assert(Legal->isReductionVariable(OrigPhi) &&
4286          "Unable to find the reduction variable");
4287   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4288 
4289   RecurKind RK = RdxDesc.getRecurrenceKind();
4290   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4291   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4292   setDebugLocFromInst(ReductionStartValue);
4293 
4294   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4295   // This is the vector-clone of the value that leaves the loop.
4296   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4297 
4298   // Wrap flags are in general invalid after vectorization, clear them.
4299   clearReductionWrapFlags(RdxDesc, State);
4300 
4301   // Before each round, move the insertion point right between
4302   // the PHIs and the values we are going to write.
4303   // This allows us to write both PHINodes and the extractelement
4304   // instructions.
4305   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4306 
4307   setDebugLocFromInst(LoopExitInst);
4308 
4309   Type *PhiTy = OrigPhi->getType();
4310   // If tail is folded by masking, the vector value to leave the loop should be
4311   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4312   // instead of the former. For an inloop reduction the reduction will already
4313   // be predicated, and does not need to be handled here.
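  //
  // For example (sketch), with tail folding the loop may contain:
  //   %sel = select <4 x i1> %mask, <4 x i32> %rdx.next, <4 x i32> %rdx.phi
  // and it is %sel, not %rdx.next, that leaves the loop.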
4314   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4315     for (unsigned Part = 0; Part < UF; ++Part) {
4316       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4317       Value *Sel = nullptr;
4318       for (User *U : VecLoopExitInst->users()) {
4319         if (isa<SelectInst>(U)) {
4320           assert(!Sel && "Reduction exit feeding two selects");
4321           Sel = U;
4322         } else
4323           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4324       }
4325       assert(Sel && "Reduction exit feeds no select");
4326       State.reset(LoopExitInstDef, Sel, Part);
4327 
4328       // If the target can create a predicated operator for the reduction at no
4329       // extra cost in the loop (for example a predicated vadd), it can be
4330       // cheaper for the select to remain in the loop than be sunk out of it,
4331       // and so use the select value for the phi instead of the old
4332       // LoopExitValue.
4333       if (PreferPredicatedReductionSelect ||
4334           TTI->preferPredicatedReductionSelect(
4335               RdxDesc.getOpcode(), PhiTy,
4336               TargetTransformInfo::ReductionFlags())) {
4337         auto *VecRdxPhi =
4338             cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part));
4339         VecRdxPhi->setIncomingValueForBlock(
4340             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4341       }
4342     }
4343   }
4344 
4345   // If the vector reduction can be performed in a smaller type, we truncate
4346   // then extend the loop exit value to enable InstCombine to evaluate the
4347   // entire expression in the smaller type.
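  //
  // For example (sketch), for an i32 add reduction whose values fit in i8:
  //   %trunc = trunc <4 x i32> %rdx to <4 x i8>
  //   %extnd = zext <4 x i8> %trunc to <4 x i32>   ; sext if signed
  // with in-loop users of %rdx rewritten to use %extnd, and the final
  // reduction in the middle block performed on the truncated value.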
4348   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4349     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
4350     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4351     Builder.SetInsertPoint(
4352         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4353     VectorParts RdxParts(UF);
4354     for (unsigned Part = 0; Part < UF; ++Part) {
4355       RdxParts[Part] = State.get(LoopExitInstDef, Part);
4356       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4357       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4358                                         : Builder.CreateZExt(Trunc, VecTy);
4359       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4360            UI != RdxParts[Part]->user_end();)
4361         if (*UI != Trunc) {
4362           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4363           RdxParts[Part] = Extnd;
4364         } else {
4365           ++UI;
4366         }
4367     }
4368     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4369     for (unsigned Part = 0; Part < UF; ++Part) {
4370       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4371       State.reset(LoopExitInstDef, RdxParts[Part], Part);
4372     }
4373   }
4374 
4375   // Reduce all of the unrolled parts into a single vector.
4376   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4377   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4378 
4379   // The middle block terminator has already been assigned a DebugLoc here (the
4380   // OrigLoop's single latch terminator). We want the whole middle block to
4381   // appear to execute on this line because: (a) it is all compiler generated,
4382   // (b) these instructions are always executed after evaluating the latch
4383   // conditional branch, and (c) other passes may add new predecessors which
4384   // terminate on this line. This is the easiest way to ensure we don't
4385   // accidentally cause an extra step back into the loop while debugging.
4386   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
4387   if (PhiR->isOrdered())
4388     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4389   else {
4390     // Floating-point operations should have some FMF to enable the reduction.
4391     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4392     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4393     for (unsigned Part = 1; Part < UF; ++Part) {
4394       Value *RdxPart = State.get(LoopExitInstDef, Part);
4395       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4396         ReducedPartRdx = Builder.CreateBinOp(
4397             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4398       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
4399         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
4400                                            ReducedPartRdx, RdxPart);
4401       else
4402         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4403     }
4404   }
4405 
4406   // Create the reduction after the loop. Note that inloop reductions create the
4407   // target reduction in the loop using a Reduction recipe.
4408   if (VF.isVector() && !PhiR->isInLoop()) {
4409     ReducedPartRdx =
4410         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4411     // If the reduction can be performed in a smaller type, we need to extend
4412     // the reduction to the wider type before we branch to the original loop.
4413     if (PhiTy != RdxDesc.getRecurrenceType())
4414       ReducedPartRdx = RdxDesc.isSigned()
4415                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4416                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4417   }
4418 
4419   // Create a phi node that merges control-flow from the backedge-taken check
4420   // block and the middle block.
4421   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4422                                         LoopScalarPreHeader->getTerminator());
4423   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4424     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4425   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4426 
4427   // Now, we need to fix the users of the reduction variable
4428   // inside and outside of the scalar remainder loop.
4429 
4430   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4431   // in the exit blocks.  See comment on analogous loop in
4432   // fixFirstOrderRecurrence for a more complete explanation of the logic.
4433   if (!Cost->requiresScalarEpilogue(VF))
4434     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4435       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4436         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4437 
4438   // Fix the scalar loop reduction variable with the incoming reduction sum
4439   // from the vector body and from the backedge value.
4440   int IncomingEdgeBlockIdx =
4441       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4442   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4443   // Pick the other block.
4444   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4445   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4446   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4447 }
4448 
4449 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4450                                                   VPTransformState &State) {
4451   RecurKind RK = RdxDesc.getRecurrenceKind();
4452   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4453     return;
4454 
4455   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4456   assert(LoopExitInstr && "null loop exit instruction");
4457   SmallVector<Instruction *, 8> Worklist;
4458   SmallPtrSet<Instruction *, 8> Visited;
4459   Worklist.push_back(LoopExitInstr);
4460   Visited.insert(LoopExitInstr);
4461 
4462   while (!Worklist.empty()) {
4463     Instruction *Cur = Worklist.pop_back_val();
4464     if (isa<OverflowingBinaryOperator>(Cur))
4465       for (unsigned Part = 0; Part < UF; ++Part) {
4466         // FIXME: Should not rely on getVPValue at this point.
4467         Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4468         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4469       }
4470 
4471     for (User *U : Cur->users()) {
4472       Instruction *UI = cast<Instruction>(U);
4473       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4474           Visited.insert(UI).second)
4475         Worklist.push_back(UI);
4476     }
4477   }
4478 }
4479 
4480 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4481   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4482     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4483       // Some phis were already hand-updated by the reduction and recurrence
4484       // code above; leave them alone.
4485       continue;
4486 
4487     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4488     // Non-instruction incoming values will have only one value.
4489 
4490     VPLane Lane = VPLane::getFirstLane();
4491     if (isa<Instruction>(IncomingValue) &&
4492         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4493                                            VF))
4494       Lane = VPLane::getLastLaneForVF(VF);
4495 
4496     // Can be a loop invariant incoming value or the last scalar value to be
4497     // extracted from the vectorized loop.
4498     // FIXME: Should not rely on getVPValue at this point.
4499     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4500     Value *lastIncomingValue =
4501         OrigLoop->isLoopInvariant(IncomingValue)
4502             ? IncomingValue
4503             : State.get(State.Plan->getVPValue(IncomingValue, true),
4504                         VPIteration(UF - 1, Lane));
4505     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4506   }
4507 }
4508 
4509 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4510   // The basic block and loop containing the predicated instruction.
4511   auto *PredBB = PredInst->getParent();
4512   auto *VectorLoop = LI->getLoopFor(PredBB);
4513 
4514   // Initialize a worklist with the operands of the predicated instruction.
4515   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4516 
4517   // Holds instructions that we need to analyze again. An instruction may be
4518   // reanalyzed if we don't yet know if we can sink it or not.
4519   SmallVector<Instruction *, 8> InstsToReanalyze;
4520 
4521   // Returns true if a given use occurs in the predicated block. Phi nodes use
4522   // their operands in their corresponding predecessor blocks.
4523   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4524     auto *I = cast<Instruction>(U.getUser());
4525     BasicBlock *BB = I->getParent();
4526     if (auto *Phi = dyn_cast<PHINode>(I))
4527       BB = Phi->getIncomingBlock(
4528           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4529     return BB == PredBB;
4530   };
4531 
4532   // Iteratively sink the scalarized operands of the predicated instruction
4533   // into the block we created for it. When an instruction is sunk, its
4534   // operands are then added to the worklist. The algorithm ends after one pass
4535   // through the worklist doesn't sink a single instruction.
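  //
  // For example, if a predicated store was sunk into its own block, the
  // address computation feeding it (say a scalar GEP and an add used only by
  // that store) can usually be sunk into that block as well over one or more
  // passes of the worklist.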
4536   bool Changed;
4537   do {
4538     // Add the instructions that need to be reanalyzed to the worklist, and
4539     // reset the changed indicator.
4540     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4541     InstsToReanalyze.clear();
4542     Changed = false;
4543 
4544     while (!Worklist.empty()) {
4545       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4546 
4547       // We can't sink an instruction if it is a phi node, is not in the loop,
4548       // or may have side effects.
4549       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4550           I->mayHaveSideEffects())
4551         continue;
4552 
4553       // If the instruction is already in PredBB, check if we can sink its
4554       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4555       // sinking the scalar instruction I, hence it appears in PredBB; but it
4556       // may have failed to sink I's operands (recursively), which we try
4557       // (again) here.
4558       if (I->getParent() == PredBB) {
4559         Worklist.insert(I->op_begin(), I->op_end());
4560         continue;
4561       }
4562 
4563       // It's legal to sink the instruction if all its uses occur in the
4564       // predicated block. Otherwise, there's nothing to do yet, and we may
4565       // need to reanalyze the instruction.
4566       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4567         InstsToReanalyze.push_back(I);
4568         continue;
4569       }
4570 
4571       // Move the instruction to the beginning of the predicated block, and add
4572       // its operands to the worklist.
4573       I->moveBefore(&*PredBB->getFirstInsertionPt());
4574       Worklist.insert(I->op_begin(), I->op_end());
4575 
4576       // The sinking may have enabled other instructions to be sunk, so we will
4577       // need to iterate.
4578       Changed = true;
4579     }
4580   } while (Changed);
4581 }
4582 
4583 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4584   for (PHINode *OrigPhi : OrigPHIsToFix) {
4585     VPWidenPHIRecipe *VPPhi =
4586         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4587     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4588     // Make sure the builder has a valid insert point.
4589     Builder.SetInsertPoint(NewPhi);
4590     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4591       VPValue *Inc = VPPhi->getIncomingValue(i);
4592       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4593       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4594     }
4595   }
4596 }
4597 
4598 bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
4599   return Cost->useOrderedReductions(RdxDesc);
4600 }
4601 
4602 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4603                                    VPUser &Operands, unsigned UF,
4604                                    ElementCount VF, bool IsPtrLoopInvariant,
4605                                    SmallBitVector &IsIndexLoopInvariant,
4606                                    VPTransformState &State) {
4607   // Construct a vector GEP by widening the operands of the scalar GEP as
4608   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4609   // results in a vector of pointers when at least one operand of the GEP
4610   // is vector-typed. Thus, to keep the representation compact, we only use
4611   // vector-typed operands for loop-varying values.
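  //
  // For example (sketch), a GEP with a loop-varying index is widened to:
  //   %vec.gep = getelementptr i32, i32* %base, <4 x i64> %vec.index
  // which produces a vector of pointers (<4 x i32*>).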
4612 
4613   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4614     // If we are vectorizing, but the GEP has only loop-invariant operands,
4615     // the GEP we build (by only using vector-typed operands for
4616     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4617     // produce a vector of pointers, we need to either arbitrarily pick an
4618     // operand to broadcast, or broadcast a clone of the original GEP.
4619     // Here, we broadcast a clone of the original.
4620     //
4621     // TODO: If at some point we decide to scalarize instructions having
4622     //       loop-invariant operands, this special case will no longer be
4623     //       required. We would add the scalarization decision to
4624     //       collectLoopScalars() and teach getVectorValue() to broadcast
4625     //       the lane-zero scalar value.
4626     auto *Clone = Builder.Insert(GEP->clone());
4627     for (unsigned Part = 0; Part < UF; ++Part) {
4628       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4629       State.set(VPDef, EntryPart, Part);
4630       addMetadata(EntryPart, GEP);
4631     }
4632   } else {
4633     // If the GEP has at least one loop-varying operand, we are sure to
4634     // produce a vector of pointers. But if we are only unrolling, we want
4635     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4636     // produce with the code below will be scalar (if VF == 1) or vector
4637     // (otherwise). Note that for the unroll-only case, we still maintain
4638     // values in the vector mapping with initVector, as we do for other
4639     // instructions.
4640     for (unsigned Part = 0; Part < UF; ++Part) {
4641       // The pointer operand of the new GEP. If it's loop-invariant, we
4642       // won't broadcast it.
4643       auto *Ptr = IsPtrLoopInvariant
4644                       ? State.get(Operands.getOperand(0), VPIteration(0, 0))
4645                       : State.get(Operands.getOperand(0), Part);
4646 
4647       // Collect all the indices for the new GEP. If any index is
4648       // loop-invariant, we won't broadcast it.
4649       SmallVector<Value *, 4> Indices;
4650       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4651         VPValue *Operand = Operands.getOperand(I);
4652         if (IsIndexLoopInvariant[I - 1])
4653           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
4654         else
4655           Indices.push_back(State.get(Operand, Part));
4656       }
4657 
4658       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4659       // but it should be a vector, otherwise.
4660       auto *NewGEP =
4661           GEP->isInBounds()
4662               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4663                                           Indices)
4664               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4665       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4666              "NewGEP is not a pointer vector");
4667       State.set(VPDef, NewGEP, Part);
4668       addMetadata(NewGEP, GEP);
4669     }
4670   }
4671 }
4672 
4673 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4674                                               VPWidenPHIRecipe *PhiR,
4675                                               VPTransformState &State) {
4676   PHINode *P = cast<PHINode>(PN);
4677   if (EnableVPlanNativePath) {
4678     // Currently we enter here in the VPlan-native path for non-induction
4679     // PHIs where all control flow is uniform. We simply widen these PHIs.
4680     // Create a vector phi with no operands - the vector phi operands will be
4681     // set at the end of vector code generation.
4682     Type *VecTy = (State.VF.isScalar())
4683                       ? PN->getType()
4684                       : VectorType::get(PN->getType(), State.VF);
4685     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4686     State.set(PhiR, VecPhi, 0);
4687     OrigPHIsToFix.push_back(P);
4688 
4689     return;
4690   }
4691 
4692   assert(PN->getParent() == OrigLoop->getHeader() &&
4693          "Non-header phis should have been handled elsewhere");
4694 
4695   // In order to support recurrences we need to be able to vectorize Phi nodes.
4696   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4697   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4698   // this value when we vectorize all of the instructions that use the PHI.
4699 
4700   assert(!Legal->isReductionVariable(P) &&
4701          "reductions should be handled elsewhere");
4702 
4703   setDebugLocFromInst(P);
4704 
4705   // This PHINode must be an induction variable.
4706   // Make sure that we know about it.
4707   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4708 
4709   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4710   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4711 
4712   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4713   // which can be found from the original scalar operations.
4714   switch (II.getKind()) {
4715   case InductionDescriptor::IK_NoInduction:
4716     llvm_unreachable("Unknown induction");
4717   case InductionDescriptor::IK_IntInduction:
4718   case InductionDescriptor::IK_FpInduction:
4719     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4720   case InductionDescriptor::IK_PtrInduction: {
4721     // Handle the pointer induction variable case.
4722     assert(P->getType()->isPointerTy() && "Unexpected type.");
4723 
4724     if (Cost->isScalarAfterVectorization(P, State.VF)) {
4725       // This is the normalized GEP that starts counting at zero.
4726       Value *PtrInd =
4727           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4728       // Determine the number of scalars we need to generate for each unroll
4729       // iteration. If the instruction is uniform, we only need to generate the
4730       // first lane. Otherwise, we generate all VF values.
4731       bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
4732       unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
4733 
4734       bool NeedsVectorIndex = !IsUniform && VF.isScalable();
4735       Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
4736       if (NeedsVectorIndex) {
4737         Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
4738         UnitStepVec = Builder.CreateStepVector(VecIVTy);
4739         PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
4740       }
4741 
4742       for (unsigned Part = 0; Part < UF; ++Part) {
4743         Value *PartStart = createStepForVF(
4744             Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
4745 
4746         if (NeedsVectorIndex) {
4747           // Here we cache the whole vector, which means we can support the
4748           // extraction of any lane. However, in some cases the extractelement
4749           // instruction that is generated for scalar uses of this vector (e.g.
4750           // a load instruction) is not folded away. Therefore we still
4751           // calculate values for the first n lanes to avoid redundant moves
4752           // (when extracting the 0th element) and to produce scalar code (i.e.
4753           // additional add/gep instructions instead of expensive extractelement
4754           // instructions) when extracting higher-order elements.
4755           Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
4756           Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
4757           Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
4758           Value *SclrGep =
4759               emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
4760           SclrGep->setName("next.gep");
4761           State.set(PhiR, SclrGep, Part);
4762         }
4763 
4764         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4765           Value *Idx = Builder.CreateAdd(
4766               PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4767           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4768           Value *SclrGep =
4769               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4770           SclrGep->setName("next.gep");
4771           State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4772         }
4773       }
4774       return;
4775     }
4776     assert(isa<SCEVConstant>(II.getStep()) &&
4777            "Induction step not a SCEV constant!");
4778     Type *PhiType = II.getStep()->getType();
4779 
4780     // Build a pointer phi
4781     Value *ScalarStartValue = II.getStartValue();
4782     Type *ScStValueType = ScalarStartValue->getType();
4783     PHINode *NewPointerPhi =
4784         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4785     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4786 
4787     // A pointer induction, performed by using a gep
4788     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4789     Instruction *InductionLoc = LoopLatch->getTerminator();
4790     const SCEV *ScalarStep = II.getStep();
4791     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4792     Value *ScalarStepValue =
4793         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4794     Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4795     Value *NumUnrolledElems =
4796         Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4797     Value *InductionGEP = GetElementPtrInst::Create(
4798         II.getElementType(), NewPointerPhi,
4799         Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4800         InductionLoc);
4801     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4802 
4803     // Create UF many actual address geps that use the pointer
4804     // phi as base and a vectorized version of the step value
4805     // (<step*0, ..., step*N>) as offset.
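    // For example (an illustrative sketch, assuming a fixed VF=4, UF=2 and a
    // unit step), part 0 adds offsets <0, 1, 2, 3> and part 1 adds offsets
    // <4, 5, 6, 7> to the pointer phi, each producing a vector of pointers.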
4806     for (unsigned Part = 0; Part < State.UF; ++Part) {
4807       Type *VecPhiType = VectorType::get(PhiType, State.VF);
4808       Value *StartOffsetScalar =
4809           Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4810       Value *StartOffset =
4811           Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
4812       // Create a vector of consecutive numbers from zero to VF.
4813       StartOffset =
4814           Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4815 
4816       Value *GEP = Builder.CreateGEP(
4817           II.getElementType(), NewPointerPhi,
4818           Builder.CreateMul(
4819               StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4820               "vector.gep"));
4821       State.set(PhiR, GEP, Part);
4822     }
4823   }
4824   }
4825 }
4826 
4827 /// A helper function for checking whether an integer division-related
4828 /// instruction may divide by zero (in which case it must be predicated if
4829 /// executed conditionally in the scalar code).
4830 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
4832 /// converted into multiplication, so we will still end up scalarizing
4833 /// the division, but can do so w/o predication.
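/// For example, 'udiv i32 %x, 7' has a non-zero constant divisor and needs no
/// predication, whereas 'udiv i32 %x, %n' with a non-constant divisor is
/// conservatively assumed to possibly divide by zero.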
4834 static bool mayDivideByZero(Instruction &I) {
4835   assert((I.getOpcode() == Instruction::UDiv ||
4836           I.getOpcode() == Instruction::SDiv ||
4837           I.getOpcode() == Instruction::URem ||
4838           I.getOpcode() == Instruction::SRem) &&
4839          "Unexpected instruction");
4840   Value *Divisor = I.getOperand(1);
4841   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4842   return !CInt || CInt->isZero();
4843 }
4844 
4845 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4846                                            VPUser &User,
4847                                            VPTransformState &State) {
4848   switch (I.getOpcode()) {
4849   case Instruction::Call:
4850   case Instruction::Br:
4851   case Instruction::PHI:
4852   case Instruction::GetElementPtr:
4853   case Instruction::Select:
4854     llvm_unreachable("This instruction is handled by a different recipe.");
4855   case Instruction::UDiv:
4856   case Instruction::SDiv:
4857   case Instruction::SRem:
4858   case Instruction::URem:
4859   case Instruction::Add:
4860   case Instruction::FAdd:
4861   case Instruction::Sub:
4862   case Instruction::FSub:
4863   case Instruction::FNeg:
4864   case Instruction::Mul:
4865   case Instruction::FMul:
4866   case Instruction::FDiv:
4867   case Instruction::FRem:
4868   case Instruction::Shl:
4869   case Instruction::LShr:
4870   case Instruction::AShr:
4871   case Instruction::And:
4872   case Instruction::Or:
4873   case Instruction::Xor: {
4874     // Just widen unops and binops.
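    // For example (a sketch, assuming VF=4 and illustrative operand names),
    // 'add nsw i32 %a, %b' becomes 'add nsw <4 x i32> %a.vec, %b.vec' for each
    // unrolled part, with the IR flags copied from the original instruction
    // below.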
4875     setDebugLocFromInst(&I);
4876 
4877     for (unsigned Part = 0; Part < UF; ++Part) {
4878       SmallVector<Value *, 2> Ops;
4879       for (VPValue *VPOp : User.operands())
4880         Ops.push_back(State.get(VPOp, Part));
4881 
4882       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4883 
4884       if (auto *VecOp = dyn_cast<Instruction>(V))
4885         VecOp->copyIRFlags(&I);
4886 
4887       // Use this vector value for all users of the original instruction.
4888       State.set(Def, V, Part);
4889       addMetadata(V, &I);
4890     }
4891 
4892     break;
4893   }
4894   case Instruction::ICmp:
4895   case Instruction::FCmp: {
4896     // Widen compares. Generate vector compares.
4897     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4898     auto *Cmp = cast<CmpInst>(&I);
4899     setDebugLocFromInst(Cmp);
4900     for (unsigned Part = 0; Part < UF; ++Part) {
4901       Value *A = State.get(User.getOperand(0), Part);
4902       Value *B = State.get(User.getOperand(1), Part);
4903       Value *C = nullptr;
4904       if (FCmp) {
4905         // Propagate fast math flags.
4906         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4907         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4908         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4909       } else {
4910         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4911       }
4912       State.set(Def, C, Part);
4913       addMetadata(C, &I);
4914     }
4915 
4916     break;
4917   }
4918 
4919   case Instruction::ZExt:
4920   case Instruction::SExt:
4921   case Instruction::FPToUI:
4922   case Instruction::FPToSI:
4923   case Instruction::FPExt:
4924   case Instruction::PtrToInt:
4925   case Instruction::IntToPtr:
4926   case Instruction::SIToFP:
4927   case Instruction::UIToFP:
4928   case Instruction::Trunc:
4929   case Instruction::FPTrunc:
4930   case Instruction::BitCast: {
4931     auto *CI = cast<CastInst>(&I);
4932     setDebugLocFromInst(CI);
4933 
    // Vectorize casts.
4935     Type *DestTy =
4936         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
4937 
4938     for (unsigned Part = 0; Part < UF; ++Part) {
4939       Value *A = State.get(User.getOperand(0), Part);
4940       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4941       State.set(Def, Cast, Part);
4942       addMetadata(Cast, &I);
4943     }
4944     break;
4945   }
4946   default:
4947     // This instruction is not vectorized by simple widening.
4948     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4949     llvm_unreachable("Unhandled instruction!");
4950   } // end of switch.
4951 }
4952 
4953 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4954                                                VPUser &ArgOperands,
4955                                                VPTransformState &State) {
4956   assert(!isa<DbgInfoIntrinsic>(I) &&
4957          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4958   setDebugLocFromInst(&I);
4959 
4960   Module *M = I.getParent()->getParent()->getParent();
4961   auto *CI = cast<CallInst>(&I);
4962 
4963   SmallVector<Type *, 4> Tys;
4964   for (Value *ArgOperand : CI->args())
4965     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4966 
4967   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4968 
  // The flag indicates whether we use an intrinsic or a regular call for the
  // vectorized version of the instruction, i.e. whether performing the
  // intrinsic call is more beneficial than a library call.
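  // For example, a call that maps to a vector intrinsic and also has a vector
  // library variant is emitted in whichever form the cost model reports as
  // cheaper below.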
4972   bool NeedToScalarize = false;
4973   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4974   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4975   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4976   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4977          "Instruction should be scalarized elsewhere.");
4978   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4979          "Either the intrinsic cost or vector call cost must be valid");
4980 
4981   for (unsigned Part = 0; Part < UF; ++Part) {
4982     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4983     SmallVector<Value *, 4> Args;
4984     for (auto &I : enumerate(ArgOperands.operands())) {
4985       // Some intrinsics have a scalar argument - don't replace it with a
4986       // vector.
4987       Value *Arg;
4988       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4989         Arg = State.get(I.value(), Part);
4990       else {
4991         Arg = State.get(I.value(), VPIteration(0, 0));
4992         if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
4993           TysForDecl.push_back(Arg->getType());
4994       }
4995       Args.push_back(Arg);
4996     }
4997 
4998     Function *VectorF;
4999     if (UseVectorIntrinsic) {
5000       // Use vector version of the intrinsic.
5001       if (VF.isVector())
5002         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
5003       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
5004       assert(VectorF && "Can't retrieve vector intrinsic.");
5005     } else {
5006       // Use vector version of the function call.
5007       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
5008 #ifndef NDEBUG
5009       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
5010              "Can't create vector function.");
5011 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    addMetadata(V, &I);
  }
5024 }
5025 
5026 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
5027                                                  VPUser &Operands,
5028                                                  bool InvariantCond,
5029                                                  VPTransformState &State) {
5030   setDebugLocFromInst(&I);
5031 
  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
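  // For example (a sketch, assuming VF=4), a loop-defined but invariant i1
  // condition is taken as lane 0 of its widened value, conceptually
  // 'extractelement <4 x i1> %cond.vec, i32 0', and reused as the scalar
  // condition of every widened select.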
5036   auto *InvarCond = InvariantCond
5037                         ? State.get(Operands.getOperand(0), VPIteration(0, 0))
5038                         : nullptr;
5039 
5040   for (unsigned Part = 0; Part < UF; ++Part) {
5041     Value *Cond =
5042         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
5043     Value *Op0 = State.get(Operands.getOperand(1), Part);
5044     Value *Op1 = State.get(Operands.getOperand(2), Part);
5045     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
5046     State.set(VPDef, Sel, Part);
5047     addMetadata(Sel, &I);
5048   }
5049 }
5050 
5051 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
5052   // We should not collect Scalars more than once per VF. Right now, this
5053   // function is called from collectUniformsAndScalars(), which already does
5054   // this check. Collecting Scalars for VF=1 does not make any sense.
5055   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
5056          "This function should not be visited twice for the same VF");
5057 
5058   SmallSetVector<Instruction *, 8> Worklist;
5059 
5060   // These sets are used to seed the analysis with pointers used by memory
5061   // accesses that will remain scalar.
5062   SmallSetVector<Instruction *, 8> ScalarPtrs;
5063   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5064   auto *Latch = TheLoop->getLoopLatch();
5065 
5066   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5067   // The pointer operands of loads and stores will be scalar as long as the
5068   // memory access is not a gather or scatter operation. The value operand of a
5069   // store will remain scalar if the store is scalarized.
5070   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5071     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5072     assert(WideningDecision != CM_Unknown &&
5073            "Widening decision should be ready at this moment");
5074     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5075       if (Ptr == Store->getValueOperand())
5076         return WideningDecision == CM_Scalarize;
5077     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
5078            "Ptr is neither a value or pointer operand");
5079     return WideningDecision != CM_GatherScatter;
5080   };
5081 
5082   // A helper that returns true if the given value is a bitcast or
5083   // getelementptr instruction contained in the loop.
5084   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5085     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5086             isa<GetElementPtrInst>(V)) &&
5087            !TheLoop->isLoopInvariant(V);
5088   };
5089 
5090   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5091     if (!isa<PHINode>(Ptr) ||
5092         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5093       return false;
5094     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5095     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5096       return false;
5097     return isScalarUse(MemAccess, Ptr);
5098   };
5099 
  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is the pointer induction of the loop, it is inserted into the
  // Worklist. If the use will be a scalar use, and the pointer is only used
  // by memory accesses, we place the pointer in ScalarPtrs. Otherwise, the
  // pointer is placed in PossibleNonScalarPtrs.
5105   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5106     if (isScalarPtrInduction(MemAccess, Ptr)) {
5107       Worklist.insert(cast<Instruction>(Ptr));
5108       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5109                         << "\n");
5110 
5111       Instruction *Update = cast<Instruction>(
5112           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5113 
      // If there is more than one user of Update (Ptr), we shouldn't assume it
      // will be scalar after vectorization as other users of the instruction
      // may require widening. Otherwise, add it to ScalarPtrs.
5117       if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) {
5118         ScalarPtrs.insert(Update);
5119         return;
5120       }
5121     }
5122     // We only care about bitcast and getelementptr instructions contained in
5123     // the loop.
5124     if (!isLoopVaryingBitCastOrGEP(Ptr))
5125       return;
5126 
5127     // If the pointer has already been identified as scalar (e.g., if it was
5128     // also identified as uniform), there's nothing to do.
5129     auto *I = cast<Instruction>(Ptr);
5130     if (Worklist.count(I))
5131       return;
5132 
5133     // If the use of the pointer will be a scalar use, and all users of the
5134     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5135     // place the pointer in PossibleNonScalarPtrs.
5136     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5137           return isa<LoadInst>(U) || isa<StoreInst>(U);
5138         }))
5139       ScalarPtrs.insert(I);
5140     else
5141       PossibleNonScalarPtrs.insert(I);
5142   };
5143 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
5148   //
5149   // (1) Add to the worklist all instructions that have been identified as
5150   // uniform-after-vectorization.
5151   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5152 
5153   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5154   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
5156   // scatter operation. The value operand of a store will remain scalar if the
5157   // store is scalarized.
5158   for (auto *BB : TheLoop->blocks())
5159     for (auto &I : *BB) {
5160       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5161         evaluatePtrUse(Load, Load->getPointerOperand());
5162       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5163         evaluatePtrUse(Store, Store->getPointerOperand());
5164         evaluatePtrUse(Store, Store->getValueOperand());
5165       }
5166     }
5167   for (auto *I : ScalarPtrs)
5168     if (!PossibleNonScalarPtrs.count(I)) {
5169       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5170       Worklist.insert(I);
5171     }
5172 
5173   // Insert the forced scalars.
5174   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5175   // induction variable when the PHI user is scalarized.
5176   auto ForcedScalar = ForcedScalars.find(VF);
5177   if (ForcedScalar != ForcedScalars.end())
5178     for (auto *I : ForcedScalar->second)
5179       Worklist.insert(I);
5180 
5181   // Expand the worklist by looking through any bitcasts and getelementptr
5182   // instructions we've already identified as scalar. This is similar to the
5183   // expansion step in collectLoopUniforms(); however, here we're only
5184   // expanding to include additional bitcasts and getelementptr instructions.
5185   unsigned Idx = 0;
5186   while (Idx != Worklist.size()) {
5187     Instruction *Dst = Worklist[Idx++];
5188     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5189       continue;
5190     auto *Src = cast<Instruction>(Dst->getOperand(0));
5191     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5192           auto *J = cast<Instruction>(U);
5193           return !TheLoop->contains(J) || Worklist.count(J) ||
5194                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5195                   isScalarUse(J, Src));
5196         })) {
5197       Worklist.insert(Src);
5198       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5199     }
5200   }
5201 
5202   // An induction variable will remain scalar if all users of the induction
5203   // variable and induction variable update remain scalar.
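  // For example, for 'i = phi [0, preheader], [i.next, latch]' with
  // 'i.next = add i, 1', the pair remains scalar only if every in-loop user
  // of i and i.next is either the other member of the pair or already in the
  // worklist.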
5204   for (auto &Induction : Legal->getInductionVars()) {
5205     auto *Ind = Induction.first;
5206     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5207 
5208     // If tail-folding is applied, the primary induction variable will be used
5209     // to feed a vector compare.
5210     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5211       continue;
5212 
5213     // Determine if all users of the induction variable are scalar after
5214     // vectorization.
5215     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5216       auto *I = cast<Instruction>(U);
5217       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5218     });
5219     if (!ScalarInd)
5220       continue;
5221 
5222     // Determine if all users of the induction variable update instruction are
5223     // scalar after vectorization.
5224     auto ScalarIndUpdate =
5225         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5226           auto *I = cast<Instruction>(U);
5227           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5228         });
5229     if (!ScalarIndUpdate)
5230       continue;
5231 
5232     // The induction variable and its update instruction will remain scalar.
5233     Worklist.insert(Ind);
5234     Worklist.insert(IndUpdate);
5235     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5236     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5237                       << "\n");
5238   }
5239 
5240   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5241 }
5242 
5243 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
5244   if (!blockNeedsPredication(I->getParent()))
5245     return false;
5246   switch(I->getOpcode()) {
5247   default:
5248     break;
5249   case Instruction::Load:
5250   case Instruction::Store: {
5251     if (!Legal->isMaskRequired(I))
5252       return false;
5253     auto *Ptr = getLoadStorePointerOperand(I);
5254     auto *Ty = getLoadStoreType(I);
5255     const Align Alignment = getLoadStoreAlignment(I);
5256     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5257                                 TTI.isLegalMaskedGather(Ty, Alignment))
5258                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5259                                 TTI.isLegalMaskedScatter(Ty, Alignment));
5260   }
5261   case Instruction::UDiv:
5262   case Instruction::SDiv:
5263   case Instruction::SRem:
5264   case Instruction::URem:
5265     return mayDivideByZero(*I);
5266   }
5267   return false;
5268 }
5269 
5270 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5271     Instruction *I, ElementCount VF) {
5272   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5273   assert(getWideningDecision(I, VF) == CM_Unknown &&
5274          "Decision should not be set yet.");
5275   auto *Group = getInterleavedAccessGroup(I);
5276   assert(Group && "Must have a group.");
5277 
  // If the instruction's allocated size doesn't equal its type size, it
5279   // requires padding and will be scalarized.
5280   auto &DL = I->getModule()->getDataLayout();
5281   auto *ScalarTy = getLoadStoreType(I);
5282   if (hasIrregularType(ScalarTy, DL))
5283     return false;
5284 
5285   // Check if masking is required.
5286   // A Group may need masking for one of two reasons: it resides in a block that
5287   // needs predication, or it was decided to use masking to deal with gaps
5288   // (either a gap at the end of a load-access that may result in a speculative
5289   // load, or any gaps in a store-access).
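  // For example, a store group with factor 3 but only 2 members has a gap and
  // therefore requires masking, as does a load group with a trailing gap when
  // no scalar epilogue is allowed to absorb the over-read.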
5290   bool PredicatedAccessRequiresMasking =
5291       blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5292   bool LoadAccessWithGapsRequiresEpilogMasking =
5293       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
5294       !isScalarEpilogueAllowed();
5295   bool StoreAccessWithGapsRequiresMasking =
5296       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
5297   if (!PredicatedAccessRequiresMasking &&
5298       !LoadAccessWithGapsRequiresEpilogMasking &&
5299       !StoreAccessWithGapsRequiresMasking)
5300     return true;
5301 
5302   // If masked interleaving is required, we expect that the user/target had
5303   // enabled it, because otherwise it either wouldn't have been created or
5304   // it should have been invalidated by the CostModel.
5305   assert(useMaskedInterleavedAccesses(TTI) &&
5306          "Masked interleave-groups for predicated accesses are not enabled.");
5307 
5308   if (Group->isReverse())
5309     return false;
5310 
5311   auto *Ty = getLoadStoreType(I);
5312   const Align Alignment = getLoadStoreAlignment(I);
5313   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5314                           : TTI.isLegalMaskedStore(Ty, Alignment);
5315 }
5316 
5317 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5318     Instruction *I, ElementCount VF) {
5319   // Get and ensure we have a valid memory instruction.
5320   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
5321 
5322   auto *Ptr = getLoadStorePointerOperand(I);
5323   auto *ScalarTy = getLoadStoreType(I);
5324 
5325   // In order to be widened, the pointer should be consecutive, first of all.
5326   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
5327     return false;
5328 
5329   // If the instruction is a store located in a predicated block, it will be
5330   // scalarized.
5331   if (isScalarWithPredication(I))
5332     return false;
5333 
  // If the instruction's allocated size doesn't equal its type size, it
5335   // requires padding and will be scalarized.
5336   auto &DL = I->getModule()->getDataLayout();
5337   if (hasIrregularType(ScalarTy, DL))
5338     return false;
5339 
5340   return true;
5341 }
5342 
5343 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5344   // We should not collect Uniforms more than once per VF. Right now,
5345   // this function is called from collectUniformsAndScalars(), which
5346   // already does this check. Collecting Uniforms for VF=1 does not make any
5347   // sense.
5348 
5349   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5350          "This function should not be visited twice for the same VF");
5351 
  // Initialize the entry for this VF so that we do not analyze it again even
  // if no uniform values are found; Uniforms.count(VF) will then return 1.
5354   Uniforms[VF].clear();
5355 
5356   // We now know that the loop is vectorizable!
5357   // Collect instructions inside the loop that will remain uniform after
5358   // vectorization.
5359 
  // Global values, params and instructions outside of the current loop are
  // out of scope.
5362   auto isOutOfScope = [&](Value *V) -> bool {
5363     Instruction *I = dyn_cast<Instruction>(V);
5364     return (!I || !TheLoop->contains(I));
5365   };
5366 
5367   SetVector<Instruction *> Worklist;
5368   BasicBlock *Latch = TheLoop->getLoopLatch();
5369 
5370   // Instructions that are scalar with predication must not be considered
5371   // uniform after vectorization, because that would create an erroneous
5372   // replicating region where only a single instance out of VF should be formed.
5373   // TODO: optimize such seldom cases if found important, see PR40816.
5374   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5375     if (isOutOfScope(I)) {
5376       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5377                         << *I << "\n");
5378       return;
5379     }
5380     if (isScalarWithPredication(I)) {
5381       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5382                         << *I << "\n");
5383       return;
5384     }
5385     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5386     Worklist.insert(I);
5387   };
5388 
5389   // Start with the conditional branch. If the branch condition is an
5390   // instruction contained in the loop that is only used by the branch, it is
5391   // uniform.
5392   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5393   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5394     addToWorklistIfAllowed(Cmp);
5395 
5396   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5397     InstWidening WideningDecision = getWideningDecision(I, VF);
5398     assert(WideningDecision != CM_Unknown &&
5399            "Widening decision should be ready at this moment");
5400 
5401     // A uniform memory op is itself uniform.  We exclude uniform stores
5402     // here as they demand the last lane, not the first one.
5403     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5404       assert(WideningDecision == CM_Scalarize);
5405       return true;
5406     }
5407 
5408     return (WideningDecision == CM_Widen ||
5409             WideningDecision == CM_Widen_Reverse ||
5410             WideningDecision == CM_Interleave);
5411   };
5412 
5413 
5414   // Returns true if Ptr is the pointer operand of a memory access instruction
5415   // I, and I is known to not require scalarization.
5416   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5417     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5418   };
5419 
5420   // Holds a list of values which are known to have at least one uniform use.
5421   // Note that there may be other uses which aren't uniform.  A "uniform use"
5422   // here is something which only demands lane 0 of the unrolled iterations;
5423   // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform).
5425   SetVector<Value *> HasUniformUse;
5426 
5427   // Scan the loop for instructions which are either a) known to have only
5428   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5429   for (auto *BB : TheLoop->blocks())
5430     for (auto &I : *BB) {
5431       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
5432         switch (II->getIntrinsicID()) {
5433         case Intrinsic::sideeffect:
5434         case Intrinsic::experimental_noalias_scope_decl:
5435         case Intrinsic::assume:
5436         case Intrinsic::lifetime_start:
5437         case Intrinsic::lifetime_end:
5438           if (TheLoop->hasLoopInvariantOperands(&I))
5439             addToWorklistIfAllowed(&I);
5440           break;
5441         default:
5442           break;
5443         }
5444       }
5445 
5446       // ExtractValue instructions must be uniform, because the operands are
5447       // known to be loop-invariant.
5448       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
5449         assert(isOutOfScope(EVI->getAggregateOperand()) &&
5450                "Expected aggregate value to be loop invariant");
5451         addToWorklistIfAllowed(EVI);
5452         continue;
5453       }
5454 
5455       // If there's no pointer operand, there's nothing to do.
5456       auto *Ptr = getLoadStorePointerOperand(&I);
5457       if (!Ptr)
5458         continue;
5459 
5460       // A uniform memory op is itself uniform.  We exclude uniform stores
5461       // here as they demand the last lane, not the first one.
5462       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5463         addToWorklistIfAllowed(&I);
5464 
5465       if (isUniformDecision(&I, VF)) {
5466         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5467         HasUniformUse.insert(Ptr);
5468       }
5469     }
5470 
5471   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5472   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5473   // disallows uses outside the loop as well.
5474   for (auto *V : HasUniformUse) {
5475     if (isOutOfScope(V))
5476       continue;
5477     auto *I = cast<Instruction>(V);
5478     auto UsersAreMemAccesses =
5479       llvm::all_of(I->users(), [&](User *U) -> bool {
5480         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5481       });
5482     if (UsersAreMemAccesses)
5483       addToWorklistIfAllowed(I);
5484   }
5485 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures that
  // a uniform instruction will only be used by uniform instructions.
5489   unsigned idx = 0;
5490   while (idx != Worklist.size()) {
5491     Instruction *I = Worklist[idx++];
5492 
5493     for (auto OV : I->operand_values()) {
5494       // isOutOfScope operands cannot be uniform instructions.
5495       if (isOutOfScope(OV))
5496         continue;
5497       // First order recurrence Phi's should typically be considered
5498       // non-uniform.
5499       auto *OP = dyn_cast<PHINode>(OV);
5500       if (OP && Legal->isFirstOrderRecurrence(OP))
5501         continue;
5502       // If all the users of the operand are uniform, then add the
5503       // operand into the uniform worklist.
5504       auto *OI = cast<Instruction>(OV);
5505       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5506             auto *J = cast<Instruction>(U);
5507             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5508           }))
5509         addToWorklistIfAllowed(OI);
5510     }
5511   }
5512 
5513   // For an instruction to be added into Worklist above, all its users inside
5514   // the loop should also be in Worklist. However, this condition cannot be
5515   // true for phi nodes that form a cyclic dependence. We must process phi
5516   // nodes separately. An induction variable will remain uniform if all users
5517   // of the induction variable and induction variable update remain uniform.
5518   // The code below handles both pointer and non-pointer induction variables.
5519   for (auto &Induction : Legal->getInductionVars()) {
5520     auto *Ind = Induction.first;
5521     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5522 
5523     // Determine if all users of the induction variable are uniform after
5524     // vectorization.
5525     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5526       auto *I = cast<Instruction>(U);
5527       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5528              isVectorizedMemAccessUse(I, Ind);
5529     });
5530     if (!UniformInd)
5531       continue;
5532 
5533     // Determine if all users of the induction variable update instruction are
5534     // uniform after vectorization.
5535     auto UniformIndUpdate =
5536         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5537           auto *I = cast<Instruction>(U);
5538           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5539                  isVectorizedMemAccessUse(I, IndUpdate);
5540         });
5541     if (!UniformIndUpdate)
5542       continue;
5543 
5544     // The induction variable and its update instruction will remain uniform.
5545     addToWorklistIfAllowed(Ind);
5546     addToWorklistIfAllowed(IndUpdate);
5547   }
5548 
5549   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5550 }
5551 
5552 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5553   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5554 
5555   if (Legal->getRuntimePointerChecking()->Need) {
5556     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5557         "runtime pointer checks needed. Enable vectorization of this "
5558         "loop with '#pragma clang loop vectorize(enable)' when "
5559         "compiling with -Os/-Oz",
5560         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5561     return true;
5562   }
5563 
5564   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5565     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5566         "runtime SCEV checks needed. Enable vectorization of this "
5567         "loop with '#pragma clang loop vectorize(enable)' when "
5568         "compiling with -Os/-Oz",
5569         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5570     return true;
5571   }
5572 
5573   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5574   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5575     reportVectorizationFailure("Runtime stride check for small trip count",
5576         "runtime stride == 1 checks needed. Enable vectorization of "
5577         "this loop without such check by compiling with -Os/-Oz",
5578         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5579     return true;
5580   }
5581 
5582   return false;
5583 }
5584 
5585 ElementCount
5586 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
5587   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
5588     return ElementCount::getScalable(0);
5589 
5590   if (Hints->isScalableVectorizationDisabled()) {
5591     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
5592                             "ScalableVectorizationDisabled", ORE, TheLoop);
5593     return ElementCount::getScalable(0);
5594   }
5595 
5596   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
5597 
5598   auto MaxScalableVF = ElementCount::getScalable(
5599       std::numeric_limits<ElementCount::ScalarTy>::max());
5600 
5601   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
5602   // FIXME: While for scalable vectors this is currently sufficient, this should
5603   // be replaced by a more detailed mechanism that filters out specific VFs,
5604   // instead of invalidating vectorization for a whole set of VFs based on the
5605   // MaxVF.
5606 
5607   // Disable scalable vectorization if the loop contains unsupported reductions.
5608   if (!canVectorizeReductions(MaxScalableVF)) {
5609     reportVectorizationInfo(
5610         "Scalable vectorization not supported for the reduction "
5611         "operations found in this loop.",
5612         "ScalableVFUnfeasible", ORE, TheLoop);
5613     return ElementCount::getScalable(0);
5614   }
5615 
5616   // Disable scalable vectorization if the loop contains any instructions
5617   // with element types not supported for scalable vectors.
5618   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
5619         return !Ty->isVoidTy() &&
5620                !this->TTI.isElementTypeLegalForScalableVector(Ty);
5621       })) {
5622     reportVectorizationInfo("Scalable vectorization is not supported "
5623                             "for all element types found in this loop.",
5624                             "ScalableVFUnfeasible", ORE, TheLoop);
5625     return ElementCount::getScalable(0);
5626   }
5627 
5628   if (Legal->isSafeForAnyVectorWidth())
5629     return MaxScalableVF;
5630 
5631   // Limit MaxScalableVF by the maximum safe dependence distance.
5632   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5633   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5634     unsigned VScaleMax = TheFunction->getFnAttribute(Attribute::VScaleRange)
5635                              .getVScaleRangeArgs()
5636                              .second;
5637     if (VScaleMax > 0)
5638       MaxVScale = VScaleMax;
5639   }
5640   MaxScalableVF = ElementCount::getScalable(
5641       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5642   if (!MaxScalableVF)
5643     reportVectorizationInfo(
5644         "Max legal vector width too small, scalable vectorization "
5645         "unfeasible.",
5646         "ScalableVFUnfeasible", ORE, TheLoop);
5647 
5648   return MaxScalableVF;
5649 }
5650 
5651 FixedScalableVFPair
5652 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5653                                                  ElementCount UserVF) {
5654   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5655   unsigned SmallestType, WidestType;
5656   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5657 
5658   // Get the maximum safe dependence distance in bits computed by LAA.
5659   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5660   // the memory accesses that is most restrictive (involved in the smallest
5661   // dependence distance).
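  // For example (an illustrative sketch), a maximum safe dependence distance
  // of 256 bits and a widest in-loop type of 32 bits give
  // MaxSafeElements = PowerOf2Floor(256 / 32) = 8.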
5662   unsigned MaxSafeElements =
5663       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
5664 
5665   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
5666   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
5667 
5668   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5669                     << ".\n");
5670   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5671                     << ".\n");
5672 
5673   // First analyze the UserVF, fall back if the UserVF should be ignored.
5674   if (UserVF) {
5675     auto MaxSafeUserVF =
5676         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
5677 
5678     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
5679       // If `VF=vscale x N` is safe, then so is `VF=N`
5680       if (UserVF.isScalable())
5681         return FixedScalableVFPair(
5682             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
5683       else
5684         return UserVF;
5685     }
5686 
5687     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5688 
5689     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
5690     // is better to ignore the hint and let the compiler choose a suitable VF.
5691     if (!UserVF.isScalable()) {
5692       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5693                         << " is unsafe, clamping to max safe VF="
5694                         << MaxSafeFixedVF << ".\n");
5695       ORE->emit([&]() {
5696         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5697                                           TheLoop->getStartLoc(),
5698                                           TheLoop->getHeader())
5699                << "User-specified vectorization factor "
5700                << ore::NV("UserVectorizationFactor", UserVF)
5701                << " is unsafe, clamping to maximum safe vectorization factor "
5702                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
5703       });
5704       return MaxSafeFixedVF;
5705     }
5706 
5707     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
5708       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5709                         << " is ignored because scalable vectors are not "
5710                            "available.\n");
5711       ORE->emit([&]() {
5712         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5713                                           TheLoop->getStartLoc(),
5714                                           TheLoop->getHeader())
5715                << "User-specified vectorization factor "
5716                << ore::NV("UserVectorizationFactor", UserVF)
5717                << " is ignored because the target does not support scalable "
5718                   "vectors. The compiler will pick a more suitable value.";
5719       });
5720     } else {
5721       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5722                         << " is unsafe. Ignoring scalable UserVF.\n");
5723       ORE->emit([&]() {
5724         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5725                                           TheLoop->getStartLoc(),
5726                                           TheLoop->getHeader())
5727                << "User-specified vectorization factor "
5728                << ore::NV("UserVectorizationFactor", UserVF)
5729                << " is unsafe. Ignoring the hint to let the compiler pick a "
5730                   "more suitable value.";
5731       });
5732     }
5733   }
5734 
5735   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5736                     << " / " << WidestType << " bits.\n");
5737 
5738   FixedScalableVFPair Result(ElementCount::getFixed(1),
5739                              ElementCount::getScalable(0));
5740   if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
5741                                            WidestType, MaxSafeFixedVF))
5742     Result.FixedVF = MaxVF;
5743 
5744   if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
5745                                            WidestType, MaxSafeScalableVF))
5746     if (MaxVF.isScalable()) {
5747       Result.ScalableVF = MaxVF;
5748       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5749                         << "\n");
5750     }
5751 
5752   return Result;
5753 }
5754 
5755 FixedScalableVFPair
5756 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5757   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to be
    // dynamically uniform if the target can skip it.
5760     reportVectorizationFailure(
5761         "Not inserting runtime ptr check for divergent target",
5762         "runtime pointer checks needed. Not enabled for divergent target",
5763         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5764     return FixedScalableVFPair::getNone();
5765   }
5766 
5767   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5768   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5769   if (TC == 1) {
5770     reportVectorizationFailure("Single iteration (non) loop",
5771         "loop trip count is one, irrelevant for vectorization",
5772         "SingleIterationLoop", ORE, TheLoop);
5773     return FixedScalableVFPair::getNone();
5774   }
5775 
5776   switch (ScalarEpilogueStatus) {
5777   case CM_ScalarEpilogueAllowed:
5778     return computeFeasibleMaxVF(TC, UserVF);
5779   case CM_ScalarEpilogueNotAllowedUsePredicate:
5780     LLVM_FALLTHROUGH;
5781   case CM_ScalarEpilogueNotNeededUsePredicate:
5782     LLVM_DEBUG(
5783         dbgs() << "LV: vector predicate hint/switch found.\n"
5784                << "LV: Not allowing scalar epilogue, creating predicated "
5785                << "vector loop.\n");
5786     break;
5787   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5788     // fallthrough as a special case of OptForSize
5789   case CM_ScalarEpilogueNotAllowedOptSize:
5790     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5791       LLVM_DEBUG(
5792           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5793     else
5794       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5795                         << "count.\n");
5796 
5797     // Bail if runtime checks are required, which are not good when optimising
5798     // for size.
5799     if (runtimeChecksRequired())
5800       return FixedScalableVFPair::getNone();
5801 
5802     break;
5803   }
5804 
  // The only loops we can vectorize without a scalar epilogue are loops with
5806   // a bottom-test and a single exiting block. We'd have to handle the fact
5807   // that not every instruction executes on the last iteration.  This will
5808   // require a lane mask which varies through the vector loop body.  (TODO)
5809   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5810     // If there was a tail-folding hint/switch, but we can't fold the tail by
5811     // masking, fallback to a vectorization with a scalar epilogue.
5812     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5813       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5814                            "scalar epilogue instead.\n");
5815       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5816       return computeFeasibleMaxVF(TC, UserVF);
5817     }
5818     return FixedScalableVFPair::getNone();
5819   }
5820 
5821   // Now try the tail folding
5822 
5823   // Invalidate interleave groups that require an epilogue if we can't mask
5824   // the interleave-group.
5825   if (!useMaskedInterleavedAccesses(TTI)) {
5826     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5827            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5830     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5831   }
5832 
5833   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
5834   // Avoid tail folding if the trip count is known to be a multiple of any VF
5835   // we chose.
  // FIXME: The condition below pessimizes the case for fixed-width vectors,
5837   // when scalable VFs are also candidates for vectorization.
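  // For example (an illustrative sketch), with a known trip count of 128, a
  // maximum fixed VF of 8 and a user interleave count of 2, the remainder
  // 128 % (8 * 2) is zero, so no tail remains and tail folding is unnecessary.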
5838   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5839     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5840     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5841            "MaxFixedVF must be a power of 2");
5842     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5843                                    : MaxFixedVF.getFixedValue();
5844     ScalarEvolution *SE = PSE.getSE();
5845     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5846     const SCEV *ExitCount = SE->getAddExpr(
5847         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5848     const SCEV *Rem = SE->getURemExpr(
5849         SE->applyLoopGuards(ExitCount, TheLoop),
5850         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5851     if (Rem->isZero()) {
5852       // Accept MaxFixedVF if we do not have a tail.
5853       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5854       return MaxFactors;
5855     }
5856   }
5857 
5858   // For scalable vectors, don't use tail folding as this is currently not yet
5859   // supported. The code is likely to have ended up here if the tripcount is
5860   // low, in which case it makes sense not to use scalable vectors.
5861   if (MaxFactors.ScalableVF.isVector())
5862     MaxFactors.ScalableVF = ElementCount::getScalable(0);
5863 
5864   // If we don't know the precise trip count, or if the trip count that we
5865   // found modulo the vectorization factor is not zero, try to fold the tail
5866   // by masking.
5867   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5868   if (Legal->prepareToFoldTailByMasking()) {
5869     FoldTailByMasking = true;
5870     return MaxFactors;
5871   }
5872 
5873   // If there was a tail-folding hint/switch, but we can't fold the tail by
5874   // masking, fallback to a vectorization with a scalar epilogue.
5875   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5876     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5877                          "scalar epilogue instead.\n");
5878     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5879     return MaxFactors;
5880   }
5881 
5882   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5883     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5884     return FixedScalableVFPair::getNone();
5885   }
5886 
5887   if (TC == 0) {
5888     reportVectorizationFailure(
5889         "Unable to calculate the loop count due to complex control flow",
5890         "unable to calculate the loop count due to complex control flow",
5891         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5892     return FixedScalableVFPair::getNone();
5893   }
5894 
5895   reportVectorizationFailure(
5896       "Cannot optimize for size and vectorize at the same time.",
5897       "cannot optimize for size and vectorize at the same time. "
5898       "Enable vectorization of this loop with '#pragma clang loop "
5899       "vectorize(enable)' when compiling with -Os/-Oz",
5900       "NoTailLoopWithOptForSize", ORE, TheLoop);
5901   return FixedScalableVFPair::getNone();
5902 }
5903 
5904 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5905     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5906     const ElementCount &MaxSafeVF) {
5907   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5908   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5909       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5910                            : TargetTransformInfo::RGK_FixedWidthVector);
5911 
5912   // Convenience function to return the minimum of two ElementCounts.
5913   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5914     assert((LHS.isScalable() == RHS.isScalable()) &&
5915            "Scalable flags must match");
5916     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5917   };
5918 
5919   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
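  // For example, a 128-bit widest register and a widest in-loop type of 32
  // bits give a maximum element count of 4 (or vscale x 4 when computing the
  // scalable maximum).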
5921   auto MaxVectorElementCount = ElementCount::get(
5922       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5923       ComputeScalableMaxVF);
5924   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5925   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5926                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5927 
5928   if (!MaxVectorElementCount) {
5929     LLVM_DEBUG(dbgs() << "LV: The target has no "
5930                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5931                       << " vector registers.\n");
5932     return ElementCount::getFixed(1);
5933   }
5934 
5935   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5936   if (ConstTripCount &&
5937       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5938       isPowerOf2_32(ConstTripCount)) {
5939     // We need to clamp the VF to be the ConstTripCount. There is no point in
5940     // choosing a higher viable VF as done in the loop below. If
5941     // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
5942     // the TC is less than or equal to the known number of lanes.
5943     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5944                       << ConstTripCount << "\n");
5945     return TripCountEC;
5946   }
5947 
5948   ElementCount MaxVF = MaxVectorElementCount;
5949   if (TTI.shouldMaximizeVectorBandwidth() ||
5950       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5951     auto MaxVectorElementCountMaxBW = ElementCount::get(
5952         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5953         ComputeScalableMaxVF);
5954     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5955 
5956     // Collect all viable vectorization factors larger than the default MaxVF
5957     // (i.e. MaxVectorElementCount).
5958     SmallVector<ElementCount, 8> VFs;
5959     for (ElementCount VS = MaxVectorElementCount * 2;
5960          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5961       VFs.push_back(VS);
5962 
5963     // For each VF calculate its register usage.
5964     auto RUs = calculateRegisterUsage(VFs);
5965 
    // Select the largest VF which doesn't require more registers than the
    // target has available.
5968     for (int i = RUs.size() - 1; i >= 0; --i) {
5969       bool Selected = true;
5970       for (auto &pair : RUs[i].MaxLocalUsers) {
5971         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5972         if (pair.second > TargetNumRegisters)
5973           Selected = false;
5974       }
5975       if (Selected) {
5976         MaxVF = VFs[i];
5977         break;
5978       }
5979     }
5980     if (ElementCount MinVF =
5981             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5982       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5983         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5984                           << ") with target's minimum: " << MinVF << '\n');
5985         MaxVF = MinVF;
5986       }
5987     }
5988   }
5989   return MaxVF;
5990 }
5991 
5992 bool LoopVectorizationCostModel::isMoreProfitable(
5993     const VectorizationFactor &A, const VectorizationFactor &B) const {
5994   InstructionCost CostA = A.Cost;
5995   InstructionCost CostB = B.Cost;
5996 
5997   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5998 
5999   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
6000       MaxTripCount) {
6001     // If we are folding the tail and the trip count is a known (possibly small)
6002     // constant, the trip count will be rounded up to an integer number of
6003     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
6004     // which we compare directly. When not folding the tail, the total cost will
    // be PerIterationCost*floor(TC/VF) + scalar remainder cost, and so is
    // approximated with the per-lane cost below instead of using the trip
    // count as here.
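    // For instance (illustrative numbers only): with MaxTripCount = 5, a VF=4
    // candidate runs ceil(5/4) = 2 tail-folded vector iterations while a VF=2
    // candidate runs ceil(5/2) = 3, so their per-iteration costs are multiplied
    // by 2 and 3 respectively before being compared.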
6008     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
6009     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
6010     return RTCostA < RTCostB;
6011   }
6012 
  // When scalable vectorization is preferred, assume for now that vscale may be
  // larger than 1, so that scalable vectorization is slightly favored over
  // fixed-width vectorization.
6016   if (Hints->isScalableVectorizationPreferred())
6017     if (A.Width.isScalable() && !B.Width.isScalable())
6018       return (CostA * B.Width.getKnownMinValue()) <=
6019              (CostB * A.Width.getKnownMinValue());
6020 
6021   // To avoid the need for FP division:
6022   //      (CostA / A.Width) < (CostB / B.Width)
6023   // <=>  (CostA * B.Width) < (CostB * A.Width)
6024   return (CostA * B.Width.getKnownMinValue()) <
6025          (CostB * A.Width.getKnownMinValue());
6026 }
6027 
6028 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
6029     const ElementCountSet &VFCandidates) {
6030   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
6031   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
6032   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
6033   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
6034          "Expected Scalar VF to be a candidate");
6035 
6036   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
6037   VectorizationFactor ChosenFactor = ScalarCost;
6038 
6039   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
6040   if (ForceVectorization && VFCandidates.size() > 1) {
6041     // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize the cost to the maximum so that at least VF = 2 is chosen
    // during cost evaluation.
6044     ChosenFactor.Cost = InstructionCost::getMax();
6045   }
6046 
6047   SmallVector<InstructionVFPair> InvalidCosts;
6048   for (const auto &i : VFCandidates) {
6049     // The cost for scalar VF=1 is already calculated, so ignore it.
6050     if (i.isScalar())
6051       continue;
6052 
6053     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
6054     VectorizationFactor Candidate(i, C.first);
6055     LLVM_DEBUG(
6056         dbgs() << "LV: Vector loop of width " << i << " costs: "
6057                << (Candidate.Cost / Candidate.Width.getKnownMinValue())
6058                << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
6059                << ".\n");
6060 
6061     if (!C.second && !ForceVectorization) {
6062       LLVM_DEBUG(
6063           dbgs() << "LV: Not considering vector loop of width " << i
6064                  << " because it will not generate any vector instructions.\n");
6065       continue;
6066     }
6067 
    // If profitable, add it to the ProfitableVFs list.
6069     if (isMoreProfitable(Candidate, ScalarCost))
6070       ProfitableVFs.push_back(Candidate);
6071 
6072     if (isMoreProfitable(Candidate, ChosenFactor))
6073       ChosenFactor = Candidate;
6074   }
6075 
6076   // Emit a report of VFs with invalid costs in the loop.
6077   if (!InvalidCosts.empty()) {
6078     // Group the remarks per instruction, keeping the instruction order from
6079     // InvalidCosts.
6080     std::map<Instruction *, unsigned> Numbering;
6081     unsigned I = 0;
6082     for (auto &Pair : InvalidCosts)
6083       if (!Numbering.count(Pair.first))
6084         Numbering[Pair.first] = I++;
6085 
6086     // Sort the list, first on instruction(number) then on VF.
6087     llvm::sort(InvalidCosts,
6088                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
6089                  if (Numbering[A.first] != Numbering[B.first])
6090                    return Numbering[A.first] < Numbering[B.first];
6091                  ElementCountComparator ECC;
6092                  return ECC(A.second, B.second);
6093                });
6094 
6095     // For a list of ordered instruction-vf pairs:
6096     //   [(load, vf1), (load, vf2), (store, vf1)]
6097     // Group the instructions together to emit separate remarks for:
6098     //   load  (vf1, vf2)
6099     //   store (vf1)
6100     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
6101     auto Subset = ArrayRef<InstructionVFPair>();
6102     do {
6103       if (Subset.empty())
6104         Subset = Tail.take_front(1);
6105 
6106       Instruction *I = Subset.front().first;
6107 
6108       // If the next instruction is different, or if there are no other pairs,
6109       // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2)]
      // to emit:
      //  remark: invalid costs for 'load' at VF=(vf1, vf2)
6113       if (Subset == Tail || Tail[Subset.size()].first != I) {
6114         std::string OutString;
6115         raw_string_ostream OS(OutString);
6116         assert(!Subset.empty() && "Unexpected empty range");
6117         OS << "Instruction with invalid costs prevented vectorization at VF=(";
6118         for (auto &Pair : Subset)
6119           OS << (Pair.second == Subset.front().second ? "" : ", ")
6120              << Pair.second;
6121         OS << "):";
6122         if (auto *CI = dyn_cast<CallInst>(I))
6123           OS << " call to " << CI->getCalledFunction()->getName();
6124         else
6125           OS << " " << I->getOpcodeName();
6126         OS.flush();
6127         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
6128         Tail = Tail.drop_front(Subset.size());
6129         Subset = {};
6130       } else
6131         // Grow the subset by one element
6132         Subset = Tail.take_front(Subset.size() + 1);
6133     } while (!Tail.empty());
6134   }
6135 
6136   if (!EnableCondStoresVectorization && NumPredStores) {
6137     reportVectorizationFailure("There are conditional stores.",
6138         "store that is conditionally executed prevents vectorization",
6139         "ConditionalStore", ORE, TheLoop);
6140     ChosenFactor = ScalarCost;
6141   }
6142 
6143   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
6144                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
6145              << "LV: Vectorization seems to be not beneficial, "
6146              << "but was forced by a user.\n");
6147   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
6148   return ChosenFactor;
6149 }
6150 
6151 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
6152     const Loop &L, ElementCount VF) const {
6153   // Cross iteration phis such as reductions need special handling and are
6154   // currently unsupported.
6155   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
6156         return Legal->isFirstOrderRecurrence(&Phi) ||
6157                Legal->isReductionVariable(&Phi);
6158       }))
6159     return false;
6160 
6161   // Phis with uses outside of the loop require special handling and are
6162   // currently unsupported.
6163   for (auto &Entry : Legal->getInductionVars()) {
6164     // Look for uses of the value of the induction at the last iteration.
6165     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
6166     for (User *U : PostInc->users())
6167       if (!L.contains(cast<Instruction>(U)))
6168         return false;
    // Look for uses of the penultimate value of the induction.
6170     for (User *U : Entry.first->users())
6171       if (!L.contains(cast<Instruction>(U)))
6172         return false;
6173   }
6174 
6175   // Induction variables that are widened require special handling that is
6176   // currently not supported.
6177   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
6178         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
6179                  this->isProfitableToScalarize(Entry.first, VF));
6180       }))
6181     return false;
6182 
  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
6186   if (L.getExitingBlock() != L.getLoopLatch())
6187     return false;
6188 
6189   return true;
6190 }
6191 
6192 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
6193     const ElementCount VF) const {
6194   // FIXME: We need a much better cost-model to take different parameters such
6195   // as register pressure, code size increase and cost of extra branches into
6196   // account. For now we apply a very crude heuristic and only consider loops
6197   // with vectorization factors larger than a certain value.
6198   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
6200   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
6201     return false;
6202   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
6203     return true;
6204   return false;
6205 }
6206 
6207 VectorizationFactor
6208 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
6209     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
6210   VectorizationFactor Result = VectorizationFactor::Disabled();
6211   if (!EnableEpilogueVectorization) {
6212     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
6213     return Result;
6214   }
6215 
6216   if (!isScalarEpilogueAllowed()) {
6217     LLVM_DEBUG(
6218         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
6219                   "allowed.\n";);
6220     return Result;
6221   }
6222 
6223   // FIXME: This can be fixed for scalable vectors later, because at this stage
6224   // the LoopVectorizer will only consider vectorizing a loop with scalable
6225   // vectors when the loop has a hint to enable vectorization for a given VF.
6226   if (MainLoopVF.isScalable()) {
6227     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
6228                          "yet supported.\n");
6229     return Result;
6230   }
6231 
6232   // Not really a cost consideration, but check for unsupported cases here to
6233   // simplify the logic.
6234   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
6235     LLVM_DEBUG(
6236         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
6237                   "not a supported candidate.\n";);
6238     return Result;
6239   }
6240 
6241   if (EpilogueVectorizationForceVF > 1) {
6242     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
6243     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
6244     if (LVP.hasPlanWithVFs({MainLoopVF, ForcedEC}))
6245       return {ForcedEC, 0};
6246     else {
6247       LLVM_DEBUG(
6248           dbgs()
6249               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
6250       return Result;
6251     }
6252   }
6253 
6254   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
6255       TheLoop->getHeader()->getParent()->hasMinSize()) {
6256     LLVM_DEBUG(
6257         dbgs()
6258             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
6259     return Result;
6260   }
6261 
6262   if (!isEpilogueVectorizationProfitable(MainLoopVF))
6263     return Result;
6264 
6265   for (auto &NextVF : ProfitableVFs)
6266     if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
6267         (Result.Width.getFixedValue() == 1 ||
6268          isMoreProfitable(NextVF, Result)) &&
6269         LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
6270       Result = NextVF;
6271 
6272   if (Result != VectorizationFactor::Disabled())
6273     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
6274                       << Result.Width.getFixedValue() << "\n";);
6275   return Result;
6276 }
6277 
6278 std::pair<unsigned, unsigned>
6279 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
6280   unsigned MinWidth = -1U;
6281   unsigned MaxWidth = 8;
6282   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
6283   for (Type *T : ElementTypesInLoop) {
6284     MinWidth = std::min<unsigned>(
6285         MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
6286     MaxWidth = std::max<unsigned>(
6287         MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
6288   }
6289   return {MinWidth, MaxWidth};
6290 }
6291 
6292 void LoopVectorizationCostModel::collectElementTypesForWidening() {
6293   ElementTypesInLoop.clear();
6294   // For each block.
6295   for (BasicBlock *BB : TheLoop->blocks()) {
6296     // For each instruction in the loop.
6297     for (Instruction &I : BB->instructionsWithoutDebug()) {
6298       Type *T = I.getType();
6299 
6300       // Skip ignored values.
6301       if (ValuesToIgnore.count(&I))
6302         continue;
6303 
6304       // Only examine Loads, Stores and PHINodes.
6305       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6306         continue;
6307 
6308       // Examine PHI nodes that are reduction variables. Update the type to
6309       // account for the recurrence type.
6310       if (auto *PN = dyn_cast<PHINode>(&I)) {
6311         if (!Legal->isReductionVariable(PN))
6312           continue;
6313         const RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[PN];
6314         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
6315             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
6316                                       RdxDesc.getRecurrenceType(),
6317                                       TargetTransformInfo::ReductionFlags()))
6318           continue;
6319         T = RdxDesc.getRecurrenceType();
6320       }
6321 
6322       // Examine the stored values.
6323       if (auto *ST = dyn_cast<StoreInst>(&I))
6324         T = ST->getValueOperand()->getType();
6325 
6326       // Ignore loaded pointer types and stored pointer types that are not
6327       // vectorizable.
6328       //
6329       // FIXME: The check here attempts to predict whether a load or store will
6330       //        be vectorized. We only know this for certain after a VF has
6331       //        been selected. Here, we assume that if an access can be
6332       //        vectorized, it will be. We should also look at extending this
6333       //        optimization to non-pointer types.
6334       //
6335       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6336           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
6337         continue;
6338 
6339       ElementTypesInLoop.insert(T);
6340     }
6341   }
6342 }
6343 
6344 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6345                                                            unsigned LoopCost) {
6346   // -- The interleave heuristics --
6347   // We interleave the loop in order to expose ILP and reduce the loop overhead.
6348   // There are many micro-architectural considerations that we can't predict
6349   // at this level. For example, frontend pressure (on decode or fetch) due to
6350   // code size, or the number and capabilities of the execution ports.
6351   //
6352   // We use the following heuristics to select the interleave count:
6353   // 1. If the code has reductions, then we interleave to break the cross
6354   // iteration dependency.
6355   // 2. If the loop is really small, then we interleave to reduce the loop
6356   // overhead.
6357   // 3. We don't interleave if we think that we will spill registers to memory
6358   // due to the increased register pressure.
6359 
6360   if (!isScalarEpilogueAllowed())
6361     return 1;
6362 
  // The maximum safe dependence distance has already been used to limit the
  // vectorization factor, so do not interleave.
6364   if (Legal->getMaxSafeDepDistBytes() != -1U)
6365     return 1;
6366 
6367   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6368   const bool HasReductions = !Legal->getReductionVars().empty();
6369   // Do not interleave loops with a relatively small known or estimated trip
6370   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled and the code has scalar reductions (HasReductions && VF == 1),
  // because in that case interleaving can expose ILP and break cross-iteration
  // dependences for the reductions.
6374   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6375       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6376     return 1;
6377 
6378   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these values below, so make sure each register class has at
  // least one user to avoid dividing by zero.
6381   for (auto& pair : R.MaxLocalUsers) {
6382     pair.second = std::max(pair.second, 1U);
6383   }
6384 
6385   // We calculate the interleave count using the following formula.
6386   // Subtract the number of loop invariants from the number of available
6387   // registers. These registers are used by all of the interleaved instances.
6388   // Next, divide the remaining registers by the number of registers that is
6389   // required by the loop, in order to estimate how many parallel instances
6390   // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations and alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when optimizing for size, in which case
  // the interleave count is already forced to 1 above.
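  // As a purely illustrative example: with 32 registers in a class, 2 of them
  // tied up by loop-invariant values and at most 5 values live at once, the
  // estimate below is PowerOf2Floor((32 - 2) / 5) = 4 interleaved instances
  // (slightly adjusted when the induction-variable heuristic is enabled).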
6396   unsigned IC = UINT_MAX;
6397 
6398   for (auto& pair : R.MaxLocalUsers) {
6399     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
6400     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6401                       << " registers of "
6402                       << TTI.getRegisterClassName(pair.first) << " register class\n");
6403     if (VF.isScalar()) {
6404       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6405         TargetNumRegisters = ForceTargetNumScalarRegs;
6406     } else {
6407       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6408         TargetNumRegisters = ForceTargetNumVectorRegs;
6409     }
6410     unsigned MaxLocalUsers = pair.second;
6411     unsigned LoopInvariantRegs = 0;
6412     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6413       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6414 
6415     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6416     // Don't count the induction variable as interleaved.
6417     if (EnableIndVarRegisterHeur) {
6418       TmpIC =
6419           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6420                         std::max(1U, (MaxLocalUsers - 1)));
6421     }
6422 
6423     IC = std::min(IC, TmpIC);
6424   }
6425 
6426   // Clamp the interleave ranges to reasonable counts.
6427   unsigned MaxInterleaveCount =
6428       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6429 
6430   // Check if the user has overridden the max.
6431   if (VF.isScalar()) {
6432     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6433       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6434   } else {
6435     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6436       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6437   }
6438 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF, ensuring the
  // result is at least 1.
6442   //
6443   // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second vector
  // iteration is enabled. However, for larger loops, there is likely to be a
6446   // similar benefit as for fixed-width vectors. For now, we choose to leave
6447   // the InterleaveCount as if vscale is '1', although if some information about
6448   // the vector is known (e.g. min vector size), we can make a better decision.
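  // E.g. (illustrative): with an estimated trip count of 48 and VF = 8, the
  // interleave count is capped at 48 / 8 = 6 so the interleaved vector body
  // does not overshoot the trip count.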
6449   if (BestKnownTC) {
6450     MaxInterleaveCount =
6451         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6452     // Make sure MaxInterleaveCount is greater than 0.
6453     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6454   }
6455 
6456   assert(MaxInterleaveCount > 0 &&
6457          "Maximum interleave count must be greater than 0");
6458 
  // Clamp the calculated IC to be between 1 and the max interleave count that
  // the target and trip count allow.
6461   if (IC > MaxInterleaveCount)
6462     IC = MaxInterleaveCount;
6463   else
6464     // Make sure IC is greater than 0.
6465     IC = std::max(1u, IC);
6466 
6467   assert(IC > 0 && "Interleave count must be greater than 0.");
6468 
6469   // If we did not calculate the cost for VF (because the user selected the VF)
6470   // then we calculate the cost of VF here.
6471   if (LoopCost == 0) {
6472     InstructionCost C = expectedCost(VF).first;
6473     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
6474     LoopCost = *C.getValue();
6475   }
6476 
6477   assert(LoopCost && "Non-zero loop cost expected");
6478 
6479   // Interleave if we vectorized this loop and there is a reduction that could
6480   // benefit from interleaving.
6481   if (VF.isVector() && HasReductions) {
6482     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6483     return IC;
6484   }
6485 
6486   // Note that if we've already vectorized the loop we will have done the
6487   // runtime check and so interleaving won't require further checks.
6488   bool InterleavingRequiresRuntimePointerCheck =
6489       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6490 
6491   // We want to interleave small loops in order to reduce the loop overhead and
6492   // potentially expose ILP opportunities.
6493   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6494                     << "LV: IC is " << IC << '\n'
6495                     << "LV: VF is " << VF << '\n');
6496   const bool AggressivelyInterleaveReductions =
6497       TTI.enableAggressiveInterleaving(HasReductions);
6498   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the per-iteration loop overhead is 1, use the cost model's
    // estimate of the loop body, and interleave until the loop overhead is
    // about 5% of the total cost of the loop.
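    // For example (illustrative, assuming the default small-loop cost threshold
    // of 20): a loop body costing 4 would be interleaved
    // PowerOf2Floor(20 / 4) = 4 times, capped by IC and the adjustments below.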
6502     unsigned SmallIC =
6503         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
6504 
6505     // Interleave until store/load ports (estimated by max interleave count) are
6506     // saturated.
6507     unsigned NumStores = Legal->getNumStores();
6508     unsigned NumLoads = Legal->getNumLoads();
6509     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6510     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6511 
6512     // There is little point in interleaving for reductions containing selects
6513     // and compares when VF=1 since it may just create more overhead than it's
6514     // worth for loops with small trip counts. This is because we still have to
6515     // do the final reduction after the loop.
6516     bool HasSelectCmpReductions =
6517         HasReductions &&
6518         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6519           const RecurrenceDescriptor &RdxDesc = Reduction.second;
6520           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
6521               RdxDesc.getRecurrenceKind());
6522         });
6523     if (HasSelectCmpReductions) {
6524       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
6525       return 1;
6526     }
6527 
6528     // If we have a scalar reduction (vector reductions are already dealt with
6529     // by this point), we can increase the critical path length if the loop
6530     // we're interleaving is inside another loop. For tree-wise reductions
6531     // set the limit to 2, and for ordered reductions it's best to disable
6532     // interleaving entirely.
6533     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6534       bool HasOrderedReductions =
6535           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6536             const RecurrenceDescriptor &RdxDesc = Reduction.second;
6537             return RdxDesc.isOrdered();
6538           });
6539       if (HasOrderedReductions) {
6540         LLVM_DEBUG(
6541             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
6542         return 1;
6543       }
6544 
6545       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6546       SmallIC = std::min(SmallIC, F);
6547       StoresIC = std::min(StoresIC, F);
6548       LoadsIC = std::min(LoadsIC, F);
6549     }
6550 
6551     if (EnableLoadStoreRuntimeInterleave &&
6552         std::max(StoresIC, LoadsIC) > SmallIC) {
6553       LLVM_DEBUG(
6554           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6555       return std::max(StoresIC, LoadsIC);
6556     }
6557 
6558     // If there are scalar reductions and TTI has enabled aggressive
6559     // interleaving for reductions, we will interleave to expose ILP.
6560     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6561         AggressivelyInterleaveReductions) {
6562       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC, but not as aggressively as the normal
      // IC, to handle the rare case where resources are too limited.
6565       return std::max(IC / 2, SmallIC);
6566     } else {
6567       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6568       return SmallIC;
6569     }
6570   }
6571 
6572   // Interleave if this is a large loop (small loops are already dealt with by
6573   // this point) that could benefit from interleaving.
6574   if (AggressivelyInterleaveReductions) {
6575     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6576     return IC;
6577   }
6578 
6579   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6580   return 1;
6581 }
6582 
6583 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6584 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6585   // This function calculates the register usage by measuring the highest number
6586   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and
6588   // assign a number to each instruction. We use RPO to ensure that defs are
6589   // met before their users. We assume that each instruction that has in-loop
6590   // users starts an interval. We record every time that an in-loop value is
6591   // used, so we have a list of the first and last occurrences of each
6592   // instruction. Next, we transpose this data structure into a multi map that
6593   // holds the list of intervals that *end* at a specific location. This multi
6594   // map allows us to perform a linear search. We scan the instructions linearly
6595   // and record each time that a new interval starts, by placing it in a set.
6596   // If we find this value in the multi-map then we remove it from the set.
6597   // The max register usage is the maximum size of the set.
6598   // We also search for instructions that are defined outside the loop, but are
6599   // used inside the loop. We need this number separately from the max-interval
6600   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
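  // As a small illustration (hypothetical snippet, not taken from any real
  // loop): in
  //   %a = ...
  //   %b = ...
  //   %c = add %a, %b
  //   %d = mul %c, %c
  // both %a and %b are live when %c is computed, so the maximum number of
  // simultaneously open intervals (and hence the estimated register usage for
  // their register class) is 2.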
6602   LoopBlocksDFS DFS(TheLoop);
6603   DFS.perform(LI);
6604 
6605   RegisterUsage RU;
6606 
6607   // Each 'key' in the map opens a new interval. The values
6608   // of the map are the index of the 'last seen' usage of the
6609   // instruction that is the key.
6610   using IntervalMap = DenseMap<Instruction *, unsigned>;
6611 
6612   // Maps instruction to its index.
6613   SmallVector<Instruction *, 64> IdxToInstr;
6614   // Marks the end of each interval.
6615   IntervalMap EndPoint;
  // Saves the set of instructions that are used inside the loop.
6617   SmallPtrSet<Instruction *, 8> Ends;
6618   // Saves the list of values that are used in the loop but are
6619   // defined outside the loop, such as arguments and constants.
6620   SmallPtrSet<Value *, 8> LoopInvariants;
6621 
6622   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6623     for (Instruction &I : BB->instructionsWithoutDebug()) {
6624       IdxToInstr.push_back(&I);
6625 
6626       // Save the end location of each USE.
6627       for (Value *U : I.operands()) {
6628         auto *Instr = dyn_cast<Instruction>(U);
6629 
6630         // Ignore non-instruction values such as arguments, constants, etc.
6631         if (!Instr)
6632           continue;
6633 
6634         // If this instruction is outside the loop then record it and continue.
6635         if (!TheLoop->contains(Instr)) {
6636           LoopInvariants.insert(Instr);
6637           continue;
6638         }
6639 
6640         // Overwrite previous end points.
6641         EndPoint[Instr] = IdxToInstr.size();
6642         Ends.insert(Instr);
6643       }
6644     }
6645   }
6646 
6647   // Saves the list of intervals that end with the index in 'key'.
6648   using InstrList = SmallVector<Instruction *, 2>;
6649   DenseMap<unsigned, InstrList> TransposeEnds;
6650 
6651   // Transpose the EndPoints to a list of values that end at each index.
6652   for (auto &Interval : EndPoint)
6653     TransposeEnds[Interval.second].push_back(Interval.first);
6654 
6655   SmallPtrSet<Instruction *, 8> OpenIntervals;
6656   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6657   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6658 
6659   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6660 
6661   // A lambda that gets the register usage for the given type and VF.
6662   const auto &TTICapture = TTI;
6663   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6664     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6665       return 0;
6666     InstructionCost::CostType RegUsage =
6667         *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6668     assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
6669            "Nonsensical values for register usage.");
6670     return RegUsage;
6671   };
6672 
6673   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6674     Instruction *I = IdxToInstr[i];
6675 
6676     // Remove all of the instructions that end at this location.
6677     InstrList &List = TransposeEnds[i];
6678     for (Instruction *ToRemove : List)
6679       OpenIntervals.erase(ToRemove);
6680 
6681     // Ignore instructions that are never used within the loop.
6682     if (!Ends.count(I))
6683       continue;
6684 
6685     // Skip ignored values.
6686     if (ValuesToIgnore.count(I))
6687       continue;
6688 
6689     // For each VF find the maximum usage of registers.
6690     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6691       // Count the number of live intervals.
6692       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6693 
6694       if (VFs[j].isScalar()) {
6695         for (auto Inst : OpenIntervals) {
6696           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6697           if (RegUsage.find(ClassID) == RegUsage.end())
6698             RegUsage[ClassID] = 1;
6699           else
6700             RegUsage[ClassID] += 1;
6701         }
6702       } else {
6703         collectUniformsAndScalars(VFs[j]);
6704         for (auto Inst : OpenIntervals) {
6705           // Skip ignored values for VF > 1.
6706           if (VecValuesToIgnore.count(Inst))
6707             continue;
6708           if (isScalarAfterVectorization(Inst, VFs[j])) {
6709             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6710             if (RegUsage.find(ClassID) == RegUsage.end())
6711               RegUsage[ClassID] = 1;
6712             else
6713               RegUsage[ClassID] += 1;
6714           } else {
6715             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
6716             if (RegUsage.find(ClassID) == RegUsage.end())
6717               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6718             else
6719               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6720           }
6721         }
6722       }
6723 
6724       for (auto& pair : RegUsage) {
6725         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
6726           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
6727         else
6728           MaxUsages[j][pair.first] = pair.second;
6729       }
6730     }
6731 
6732     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6733                       << OpenIntervals.size() << '\n');
6734 
6735     // Add the current instruction to the list of open intervals.
6736     OpenIntervals.insert(I);
6737   }
6738 
6739   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6740     SmallMapVector<unsigned, unsigned, 4> Invariant;
6741 
6742     for (auto Inst : LoopInvariants) {
6743       unsigned Usage =
6744           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6745       unsigned ClassID =
6746           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6747       if (Invariant.find(ClassID) == Invariant.end())
6748         Invariant[ClassID] = Usage;
6749       else
6750         Invariant[ClassID] += Usage;
6751     }
6752 
6753     LLVM_DEBUG({
6754       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6755       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6756              << " item\n";
6757       for (const auto &pair : MaxUsages[i]) {
6758         dbgs() << "LV(REG): RegisterClass: "
6759                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6760                << " registers\n";
6761       }
6762       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6763              << " item\n";
6764       for (const auto &pair : Invariant) {
6765         dbgs() << "LV(REG): RegisterClass: "
6766                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6767                << " registers\n";
6768       }
6769     });
6770 
6771     RU.LoopInvariantRegs = Invariant;
6772     RU.MaxLocalUsers = MaxUsages[i];
6773     RUs[i] = RU;
6774   }
6775 
6776   return RUs;
6777 }
6778 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6780   // TODO: Cost model for emulated masked load/store is completely
6781   // broken. This hack guides the cost model to use an artificially
6782   // high enough value to practically disable vectorization with such
6783   // operations, except where previously deployed legality hack allowed
6784   // using very low cost values. This is to avoid regressions coming simply
6785   // from moving "masked load/store" check from legality to cost model.
6786   // Masked Load/Gather emulation was previously never allowed.
  // Only a limited amount of masked Store/Scatter emulation was allowed.
6788   assert(isPredicatedInst(I) &&
6789          "Expecting a scalar emulated instruction");
6790   return isa<LoadInst>(I) ||
6791          (isa<StoreInst>(I) &&
6792           NumPredStores > NumberOfStoresToPredicate);
6793 }
6794 
6795 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6796   // If we aren't vectorizing the loop, or if we've already collected the
6797   // instructions to scalarize, there's nothing to do. Collection may already
6798   // have occurred if we have a user-selected VF and are now computing the
6799   // expected cost for interleaving.
6800   if (VF.isScalar() || VF.isZero() ||
6801       InstsToScalarize.find(VF) != InstsToScalarize.end())
6802     return;
6803 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6805   // not profitable to scalarize any instructions, the presence of VF in the
6806   // map will indicate that we've analyzed it already.
6807   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6808 
6809   // Find all the instructions that are scalar with predication in the loop and
6810   // determine if it would be better to not if-convert the blocks they are in.
6811   // If so, we also record the instructions to scalarize.
6812   for (BasicBlock *BB : TheLoop->blocks()) {
6813     if (!blockNeedsPredication(BB))
6814       continue;
6815     for (Instruction &I : *BB)
6816       if (isScalarWithPredication(&I)) {
6817         ScalarCostsTy ScalarCosts;
        // Do not apply the discount if the VF is scalable, because that would
        // lead to invalid scalarization costs.
        // Also skip the discount logic if the hacked cost is needed for
        // emulated masked memrefs.
6822         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I) &&
6823             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6824           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6825         // Remember that BB will remain after vectorization.
6826         PredicatedBBsAfterVectorization.insert(BB);
6827       }
6828   }
6829 }
6830 
6831 int LoopVectorizationCostModel::computePredInstDiscount(
6832     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6833   assert(!isUniformAfterVectorization(PredInst, VF) &&
6834          "Instruction marked uniform-after-vectorization will be predicated");
6835 
6836   // Initialize the discount to zero, meaning that the scalar version and the
6837   // vector version cost the same.
6838   InstructionCost Discount = 0;
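  // For instance (illustrative): for a chain whose predicated vector form costs
  // 10 in total and whose scalarized form costs 6, the discount accumulated
  // below is +4 and scalarization is considered profitable for that chain.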
6839 
6840   // Holds instructions to analyze. The instructions we visit are mapped in
6841   // ScalarCosts. Those instructions are the ones that would be scalarized if
6842   // we find that the scalar version costs less.
6843   SmallVector<Instruction *, 8> Worklist;
6844 
6845   // Returns true if the given instruction can be scalarized.
6846   auto canBeScalarized = [&](Instruction *I) -> bool {
6847     // We only attempt to scalarize instructions forming a single-use chain
6848     // from the original predicated block that would otherwise be vectorized.
6849     // Although not strictly necessary, we give up on instructions we know will
6850     // already be scalar to avoid traversing chains that are unlikely to be
6851     // beneficial.
6852     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6853         isScalarAfterVectorization(I, VF))
6854       return false;
6855 
6856     // If the instruction is scalar with predication, it will be analyzed
6857     // separately. We ignore it within the context of PredInst.
6858     if (isScalarWithPredication(I))
6859       return false;
6860 
6861     // If any of the instruction's operands are uniform after vectorization,
6862     // the instruction cannot be scalarized. This prevents, for example, a
6863     // masked load from being scalarized.
6864     //
6865     // We assume we will only emit a value for lane zero of an instruction
6866     // marked uniform after vectorization, rather than VF identical values.
6867     // Thus, if we scalarize an instruction that uses a uniform, we would
6868     // create uses of values corresponding to the lanes we aren't emitting code
6869     // for. This behavior can be changed by allowing getScalarValue to clone
6870     // the lane zero values for uniforms rather than asserting.
6871     for (Use &U : I->operands())
6872       if (auto *J = dyn_cast<Instruction>(U.get()))
6873         if (isUniformAfterVectorization(J, VF))
6874           return false;
6875 
6876     // Otherwise, we can scalarize the instruction.
6877     return true;
6878   };
6879 
6880   // Compute the expected cost discount from scalarizing the entire expression
6881   // feeding the predicated instruction. We currently only consider expressions
6882   // that are single-use instruction chains.
6883   Worklist.push_back(PredInst);
6884   while (!Worklist.empty()) {
6885     Instruction *I = Worklist.pop_back_val();
6886 
6887     // If we've already analyzed the instruction, there's nothing to do.
6888     if (ScalarCosts.find(I) != ScalarCosts.end())
6889       continue;
6890 
6891     // Compute the cost of the vector instruction. Note that this cost already
6892     // includes the scalarization overhead of the predicated instruction.
6893     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6894 
6895     // Compute the cost of the scalarized instruction. This cost is the cost of
6896     // the instruction as if it wasn't if-converted and instead remained in the
6897     // predicated block. We will scale this cost by block probability after
6898     // computing the scalarization overhead.
6899     InstructionCost ScalarCost =
6900         VF.getFixedValue() *
6901         getInstructionCost(I, ElementCount::getFixed(1)).first;
6902 
6903     // Compute the scalarization overhead of needed insertelement instructions
6904     // and phi nodes.
6905     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6906       ScalarCost += TTI.getScalarizationOverhead(
6907           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6908           APInt::getAllOnes(VF.getFixedValue()), true, false);
6909       ScalarCost +=
6910           VF.getFixedValue() *
6911           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6912     }
6913 
6914     // Compute the scalarization overhead of needed extractelement
6915     // instructions. For each of the instruction's operands, if the operand can
6916     // be scalarized, add it to the worklist; otherwise, account for the
6917     // overhead.
6918     for (Use &U : I->operands())
6919       if (auto *J = dyn_cast<Instruction>(U.get())) {
6920         assert(VectorType::isValidElementType(J->getType()) &&
6921                "Instruction has non-scalar type");
6922         if (canBeScalarized(J))
6923           Worklist.push_back(J);
6924         else if (needsExtract(J, VF)) {
6925           ScalarCost += TTI.getScalarizationOverhead(
6926               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6927               APInt::getAllOnes(VF.getFixedValue()), false, true);
6928         }
6929       }
6930 
6931     // Scale the total scalar cost by block probability.
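    // E.g. with the current assumption that a predicated block has roughly a
    // 50% chance of executing, getReciprocalPredBlockProb() is 2 and the
    // scalar cost is halved here.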
6932     ScalarCost /= getReciprocalPredBlockProb();
6933 
6934     // Compute the discount. A non-negative discount means the vector version
6935     // of the instruction costs more, and scalarizing would be beneficial.
6936     Discount += VectorCost - ScalarCost;
6937     ScalarCosts[I] = ScalarCost;
6938   }
6939 
6940   return *Discount.getValue();
6941 }
6942 
6943 LoopVectorizationCostModel::VectorizationCostTy
6944 LoopVectorizationCostModel::expectedCost(
6945     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6946   VectorizationCostTy Cost;
6947 
6948   // For each block.
6949   for (BasicBlock *BB : TheLoop->blocks()) {
6950     VectorizationCostTy BlockCost;
6951 
6952     // For each instruction in the old loop.
6953     for (Instruction &I : BB->instructionsWithoutDebug()) {
6954       // Skip ignored values.
6955       if (ValuesToIgnore.count(&I) ||
6956           (VF.isVector() && VecValuesToIgnore.count(&I)))
6957         continue;
6958 
6959       VectorizationCostTy C = getInstructionCost(&I, VF);
6960 
6961       // Check if we should override the cost.
6962       if (C.first.isValid() &&
6963           ForceTargetInstructionCost.getNumOccurrences() > 0)
6964         C.first = InstructionCost(ForceTargetInstructionCost);
6965 
6966       // Keep a list of instructions with invalid costs.
6967       if (Invalid && !C.first.isValid())
6968         Invalid->emplace_back(&I, VF);
6969 
6970       BlockCost.first += C.first;
6971       BlockCost.second |= C.second;
6972       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6973                         << " for VF " << VF << " For instruction: " << I
6974                         << '\n');
6975     }
6976 
6977     // If we are vectorizing a predicated block, it will have been
6978     // if-converted. This means that the block's instructions (aside from
6979     // stores and instructions that may divide by zero) will now be
6980     // unconditionally executed. For the scalar case, we may not always execute
6981     // the predicated block, if it is an if-else block. Thus, scale the block's
6982     // cost by the probability of executing it. blockNeedsPredication from
6983     // Legal is used so as to not include all blocks in tail folded loops.
6984     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6985       BlockCost.first /= getReciprocalPredBlockProb();
6986 
6987     Cost.first += BlockCost.first;
6988     Cost.second |= BlockCost.second;
6989   }
6990 
6991   return Cost;
6992 }
6993 
/// Gets the address access SCEV after verifying that the access pattern
/// is loop invariant except for the induction variable dependence.
6996 ///
6997 /// This SCEV can be sent to the Target in order to estimate the address
6998 /// calculation cost.
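///
/// For example (illustrative IR), for
///   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
/// where %base is loop invariant and %iv is an induction variable, the SCEV of
/// the pointer is returned; if any index is neither loop invariant nor an
/// induction variable, nullptr is returned instead.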
6999 static const SCEV *getAddressAccessSCEV(
7000               Value *Ptr,
7001               LoopVectorizationLegality *Legal,
7002               PredicatedScalarEvolution &PSE,
7003               const Loop *TheLoop) {
7004 
7005   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
7006   if (!Gep)
7007     return nullptr;
7008 
7009   // We are looking for a gep with all loop invariant indices except for one
7010   // which should be an induction variable.
7011   auto SE = PSE.getSE();
7012   unsigned NumOperands = Gep->getNumOperands();
7013   for (unsigned i = 1; i < NumOperands; ++i) {
7014     Value *Opd = Gep->getOperand(i);
7015     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
7016         !Legal->isInductionVariable(Opd))
7017       return nullptr;
7018   }
7019 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
7021   return PSE.getSCEV(Ptr);
7022 }
7023 
7024 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
7025   return Legal->hasStride(I->getOperand(0)) ||
7026          Legal->hasStride(I->getOperand(1));
7027 }
7028 
7029 InstructionCost
7030 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
7031                                                         ElementCount VF) {
7032   assert(VF.isVector() &&
7033          "Scalarization cost of instruction implies vectorization.");
7034   if (VF.isScalable())
7035     return InstructionCost::getInvalid();
7036 
7037   Type *ValTy = getLoadStoreType(I);
7038   auto SE = PSE.getSE();
7039 
7040   unsigned AS = getLoadStoreAddressSpace(I);
7041   Value *Ptr = getLoadStorePointerOperand(I);
7042   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
7043 
7044   // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
7046   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
7047 
7048   // Get the cost of the scalar memory instruction and address computation.
7049   InstructionCost Cost =
7050       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
7051 
7052   // Don't pass *I here, since it is scalar but will actually be part of a
7053   // vectorized loop where the user of it is a vectorized instruction.
7054   const Align Alignment = getLoadStoreAlignment(I);
7055   Cost += VF.getKnownMinValue() *
7056           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
7057                               AS, TTI::TCK_RecipThroughput);
7058 
7059   // Get the overhead of the extractelement and insertelement instructions
7060   // we might create due to scalarization.
7061   Cost += getScalarizationOverhead(I, VF);
7062 
7063   // If we have a predicated load/store, it will need extra i1 extracts and
7064   // conditional branches, but may not be executed for each vector lane. Scale
7065   // the cost by the probability of executing the predicated block.
7066   if (isPredicatedInst(I)) {
7067     Cost /= getReciprocalPredBlockProb();
7068 
7069     // Add the cost of an i1 extract and a branch
7070     auto *Vec_i1Ty =
7071         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
7072     Cost += TTI.getScalarizationOverhead(
7073         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
7074         /*Insert=*/false, /*Extract=*/true);
7075     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
7076 
7077     if (useEmulatedMaskMemRefHack(I))
7078       // Artificially setting to a high enough value to practically disable
7079       // vectorization with such operations.
7080       Cost = 3000000;
7081   }
7082 
7083   return Cost;
7084 }
7085 
7086 InstructionCost
7087 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
7088                                                     ElementCount VF) {
7089   Type *ValTy = getLoadStoreType(I);
7090   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7091   Value *Ptr = getLoadStorePointerOperand(I);
7092   unsigned AS = getLoadStoreAddressSpace(I);
7093   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
7094   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7095 
7096   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7097          "Stride should be 1 or -1 for consecutive memory access");
7098   const Align Alignment = getLoadStoreAlignment(I);
7099   InstructionCost Cost = 0;
7100   if (Legal->isMaskRequired(I))
7101     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
7102                                       CostKind);
7103   else
7104     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
7105                                 CostKind, I);
7106 
7107   bool Reverse = ConsecutiveStride < 0;
7108   if (Reverse)
7109     Cost +=
7110         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
7111   return Cost;
7112 }
7113 
7114 InstructionCost
7115 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
7116                                                 ElementCount VF) {
7117   assert(Legal->isUniformMemOp(*I));
7118 
7119   Type *ValTy = getLoadStoreType(I);
7120   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7121   const Align Alignment = getLoadStoreAlignment(I);
7122   unsigned AS = getLoadStoreAddressSpace(I);
7123   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7124   if (isa<LoadInst>(I)) {
7125     return TTI.getAddressComputationCost(ValTy) +
7126            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
7127                                CostKind) +
7128            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
7129   }
7130   StoreInst *SI = cast<StoreInst>(I);
7131 
7132   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
7133   return TTI.getAddressComputationCost(ValTy) +
7134          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
7135                              CostKind) +
7136          (isLoopInvariantStoreValue
7137               ? 0
7138               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
7139                                        VF.getKnownMinValue() - 1));
7140 }
7141 
7142 InstructionCost
7143 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
7144                                                  ElementCount VF) {
7145   Type *ValTy = getLoadStoreType(I);
7146   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7147   const Align Alignment = getLoadStoreAlignment(I);
7148   const Value *Ptr = getLoadStorePointerOperand(I);
7149 
7150   return TTI.getAddressComputationCost(VectorTy) +
7151          TTI.getGatherScatterOpCost(
7152              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
7153              TargetTransformInfo::TCK_RecipThroughput, I);
7154 }
7155 
7156 InstructionCost
7157 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
7158                                                    ElementCount VF) {
7159   // TODO: Once we have support for interleaving with scalable vectors
7160   // we can calculate the cost properly here.
7161   if (VF.isScalable())
7162     return InstructionCost::getInvalid();
7163 
7164   Type *ValTy = getLoadStoreType(I);
7165   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7166   unsigned AS = getLoadStoreAddressSpace(I);
7167 
7168   auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Failed to get an interleaved access group.");
7170 
7171   unsigned InterleaveFactor = Group->getFactor();
7172   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
7173 
7174   // Holds the indices of existing members in the interleaved group.
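  // For example (illustrative), a factor-4 group with members only at
  // positions 0 and 2 yields Indices = {0, 2}.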
7175   SmallVector<unsigned, 4> Indices;
7176   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
7177     if (Group->getMember(IF))
7178       Indices.push_back(IF);
7179 
7180   // Calculate the cost of the whole interleaved group.
7181   bool UseMaskForGaps =
7182       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
7183       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
7184   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
7185       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
7186       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
7187 
7188   if (Group->isReverse()) {
7189     // TODO: Add support for reversed masked interleaved access.
7190     assert(!Legal->isMaskRequired(I) &&
7191            "Reverse masked interleaved access not supported.");
7192     Cost +=
7193         Group->getNumMembers() *
7194         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
7195   }
7196   return Cost;
7197 }
7198 
7199 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
7200     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
7201   using namespace llvm::PatternMatch;
7202   // Early exit for no inloop reductions
7203   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
7204     return None;
7205   auto *VectorTy = cast<VectorType>(Ty);
7206 
  // We are looking for one of the following patterns, choosing the minimal
  // acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree, finding the root reduction
  // instruction in InLoopReductionImmediateChains. From there we find the
  // pattern of mul/ext and test the cost of the entire pattern vs the cost of
  // the components. If the reduction cost is lower, then we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return an invalid cost specifying the original cost method
  // should be used.
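  // For example (illustrative IR only; value names and types are made up), an
  // in-loop reduce(mul(ext(A), ext(B))) chain could look like:
  //   %a.ext = sext <16 x i8> %a to <16 x i32>
  //   %b.ext = sext <16 x i8> %b to <16 x i32>
  //   %mul = mul <16 x i32> %a.ext, %b.ext
  //   %rdx = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
  //   %rdx.next = add i32 %sum.phi, %rdx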
7219   Instruction *RetI = I;
7220   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
7221     if (!RetI->hasOneUser())
7222       return None;
7223     RetI = RetI->user_back();
7224   }
7225   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
7226       RetI->user_back()->getOpcode() == Instruction::Add) {
7227     if (!RetI->hasOneUser())
7228       return None;
7229     RetI = RetI->user_back();
7230   }
7231 
  // Test if the found instruction is a reduction; if it is not, return an
  // invalid cost so that the parent uses the original cost modelling.
7234   if (!InLoopReductionImmediateChains.count(RetI))
7235     return None;
7236 
7237   // Find the reduction this chain is a part of and calculate the basic cost of
7238   // the reduction on its own.
7239   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
7240   Instruction *ReductionPhi = LastChain;
7241   while (!isa<PHINode>(ReductionPhi))
7242     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
7243 
7244   const RecurrenceDescriptor &RdxDesc =
7245       Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
7246 
7247   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
7248       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
7249 
7250   // If we're using ordered reductions then we can just return the base cost
7251   // here, since getArithmeticReductionCost calculates the full ordered
7252   // reduction cost when FP reassociation is not allowed.
7253   if (useOrderedReductions(RdxDesc))
7254     return BaseCost;
7255 
7256   // Get the operand that was not the reduction chain and match it to one of the
7257   // patterns, returning the better cost if it is found.
7258   Instruction *RedOp = RetI->getOperand(1) == LastChain
7259                            ? dyn_cast<Instruction>(RetI->getOperand(0))
7260                            : dyn_cast<Instruction>(RetI->getOperand(1));
7261 
7262   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
7263 
7264   Instruction *Op0, *Op1;
7265   if (RedOp &&
7266       match(RedOp,
7267             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
7268       match(Op0, m_ZExtOrSExt(m_Value())) &&
7269       Op0->getOpcode() == Op1->getOpcode() &&
7270       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
7271       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
7272       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
7273 
    // Matched reduce(ext(mul(ext(A), ext(B))))
7275     // Note that the extend opcodes need to all match, or if A==B they will have
7276     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
7277     // which is equally fine.
7278     bool IsUnsigned = isa<ZExtInst>(Op0);
7279     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
7280     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
7281 
7282     InstructionCost ExtCost =
7283         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
7284                              TTI::CastContextHint::None, CostKind, Op0);
7285     InstructionCost MulCost =
7286         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
7287     InstructionCost Ext2Cost =
7288         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
7289                              TTI::CastContextHint::None, CostKind, RedOp);
7290 
7291     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7292         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7293         CostKind);
7294 
7295     if (RedCost.isValid() &&
7296         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
7297       return I == RetI ? RedCost : 0;
7298   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
7299              !TheLoop->isLoopInvariant(RedOp)) {
7300     // Matched reduce(ext(A))
7301     bool IsUnsigned = isa<ZExtInst>(RedOp);
7302     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
7303     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7304         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7305         CostKind);
7306 
7307     InstructionCost ExtCost =
7308         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
7309                              TTI::CastContextHint::None, CostKind, RedOp);
7310     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
7311       return I == RetI ? RedCost : 0;
7312   } else if (RedOp &&
7313              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
7314     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
7315         Op0->getOpcode() == Op1->getOpcode() &&
7316         Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
7317         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
7318       bool IsUnsigned = isa<ZExtInst>(Op0);
7319       auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
7320       // Matched reduce(mul(ext, ext))
7321       InstructionCost ExtCost =
7322           TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
7323                                TTI::CastContextHint::None, CostKind, Op0);
7324       InstructionCost MulCost =
7325           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7326 
7327       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7328           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7329           CostKind);
7330 
7331       if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
7332         return I == RetI ? RedCost : 0;
7333     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
7334       // Matched reduce(mul())
7335       InstructionCost MulCost =
7336           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7337 
7338       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7339           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
7340           CostKind);
7341 
7342       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
7343         return I == RetI ? RedCost : 0;
7344     }
7345   }
7346 
7347   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
7348 }
7349 
7350 InstructionCost
7351 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7352                                                      ElementCount VF) {
7353   // Calculate scalar cost only. Vectorization cost should be ready at this
7354   // moment.
7355   if (VF.isScalar()) {
7356     Type *ValTy = getLoadStoreType(I);
7357     const Align Alignment = getLoadStoreAlignment(I);
7358     unsigned AS = getLoadStoreAddressSpace(I);
7359 
7360     return TTI.getAddressComputationCost(ValTy) +
7361            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7362                                TTI::TCK_RecipThroughput, I);
7363   }
7364   return getWideningCost(I, VF);
7365 }
7366 
7367 LoopVectorizationCostModel::VectorizationCostTy
7368 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7369                                                ElementCount VF) {
7370   // If we know that this instruction will remain uniform, check the cost of
7371   // the scalar version.
7372   if (isUniformAfterVectorization(I, VF))
7373     VF = ElementCount::getFixed(1);
7374 
7375   if (VF.isVector() && isProfitableToScalarize(I, VF))
7376     return VectorizationCostTy(InstsToScalarize[VF][I], false);
7377 
7378   // Forced scalars do not have any scalarization overhead.
7379   auto ForcedScalar = ForcedScalars.find(VF);
7380   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
7381     auto InstSet = ForcedScalar->second;
7382     if (InstSet.count(I))
7383       return VectorizationCostTy(
7384           (getInstructionCost(I, ElementCount::getFixed(1)).first *
7385            VF.getKnownMinValue()),
7386           false);
7387   }
7388 
7389   Type *VectorTy;
7390   InstructionCost C = getInstructionCost(I, VF, VectorTy);
7391 
7392   bool TypeNotScalarized =
7393       VF.isVector() && VectorTy->isVectorTy() &&
7394       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
7395   return VectorizationCostTy(C, TypeNotScalarized);
7396 }
7397 
7398 InstructionCost
7399 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
7400                                                      ElementCount VF) const {
7401 
7402   // There is no mechanism yet to create a scalable scalarization loop,
7403   // so this is currently Invalid.
7404   if (VF.isScalable())
7405     return InstructionCost::getInvalid();
7406 
7407   if (VF.isScalar())
7408     return 0;
7409 
7410   InstructionCost Cost = 0;
7411   Type *RetTy = ToVectorTy(I->getType(), VF);
7412   if (!RetTy->isVoidTy() &&
7413       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
7414     Cost += TTI.getScalarizationOverhead(
7415         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
7416         false);
7417 
7418   // Some targets keep addresses scalar.
7419   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
7420     return Cost;
7421 
7422   // Some targets support efficient element stores.
7423   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
7424     return Cost;
7425 
7426   // Collect operands to consider.
7427   CallInst *CI = dyn_cast<CallInst>(I);
7428   Instruction::op_range Ops = CI ? CI->args() : I->operands();
7429 
7430   // Skip operands that do not require extraction/scalarization and do not incur
7431   // any overhead.
7432   SmallVector<Type *> Tys;
7433   for (auto *V : filterExtractingOperands(Ops, VF))
7434     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7435   return Cost + TTI.getOperandsScalarizationOverhead(
7436                     filterExtractingOperands(Ops, VF), Tys);
7437 }
7438 
7439 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
7440   if (VF.isScalar())
7441     return;
7442   NumPredStores = 0;
7443   for (BasicBlock *BB : TheLoop->blocks()) {
7444     // For each instruction in the old loop.
7445     for (Instruction &I : *BB) {
7446       Value *Ptr =  getLoadStorePointerOperand(&I);
7447       if (!Ptr)
7448         continue;
7449 
7450       // TODO: We should generate better code and update the cost model for
7451       // predicated uniform stores. Today they are treated as any other
7452       // predicated store (see added test cases in
7453       // invariant-store-vectorization.ll).
7454       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
7455         NumPredStores++;
7456 
7457       if (Legal->isUniformMemOp(I)) {
7458         // TODO: Avoid replicating loads and stores instead of
7459         // relying on instcombine to remove them.
7460         // Load: Scalar load + broadcast
7461         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
7462         InstructionCost Cost;
7463         if (isa<StoreInst>(&I) && VF.isScalable() &&
7464             isLegalGatherOrScatter(&I)) {
7465           Cost = getGatherScatterCost(&I, VF);
7466           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
7467         } else {
7468           assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
7469                  "Cannot yet scalarize uniform stores");
7470           Cost = getUniformMemOpCost(&I, VF);
7471           setWideningDecision(&I, VF, CM_Scalarize, Cost);
7472         }
7473         continue;
7474       }
7475 
7476       // We assume that widening is the best solution when possible.
7477       if (memoryInstructionCanBeWidened(&I, VF)) {
7478         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7479         int ConsecutiveStride = Legal->isConsecutivePtr(
7480             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
7481         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7482                "Expected consecutive stride.");
7483         InstWidening Decision =
7484             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7485         setWideningDecision(&I, VF, Decision, Cost);
7486         continue;
7487       }
7488 
7489       // Choose between Interleaving, Gather/Scatter or Scalarization.
7490       InstructionCost InterleaveCost = InstructionCost::getInvalid();
7491       unsigned NumAccesses = 1;
7492       if (isAccessInterleaved(&I)) {
7493         auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Failed to get an interleaved access group.");
7495 
7496         // Make one decision for the whole group.
7497         if (getWideningDecision(&I, VF) != CM_Unknown)
7498           continue;
7499 
7500         NumAccesses = Group->getNumMembers();
7501         if (interleavedAccessCanBeWidened(&I, VF))
7502           InterleaveCost = getInterleaveGroupCost(&I, VF);
7503       }
7504 
7505       InstructionCost GatherScatterCost =
7506           isLegalGatherOrScatter(&I)
7507               ? getGatherScatterCost(&I, VF) * NumAccesses
7508               : InstructionCost::getInvalid();
7509 
7510       InstructionCost ScalarizationCost =
7511           getMemInstScalarizationCost(&I, VF) * NumAccesses;
7512 
      // Choose the better solution for the current VF, record this decision,
      // and use it during vectorization.
7515       InstructionCost Cost;
7516       InstWidening Decision;
7517       if (InterleaveCost <= GatherScatterCost &&
7518           InterleaveCost < ScalarizationCost) {
7519         Decision = CM_Interleave;
7520         Cost = InterleaveCost;
7521       } else if (GatherScatterCost < ScalarizationCost) {
7522         Decision = CM_GatherScatter;
7523         Cost = GatherScatterCost;
7524       } else {
7525         Decision = CM_Scalarize;
7526         Cost = ScalarizationCost;
7527       }
      // If the instruction belongs to an interleave group, the whole group
7529       // receives the same decision. The whole group receives the cost, but
7530       // the cost will actually be assigned to one instruction.
7531       if (auto Group = getInterleavedAccessGroup(&I))
7532         setWideningDecision(Group, VF, Decision, Cost);
7533       else
7534         setWideningDecision(&I, VF, Decision, Cost);
7535     }
7536   }
7537 
  // Make sure that any load of an address and any other address computation
7539   // remains scalar unless there is gather/scatter support. This avoids
7540   // inevitable extracts into address registers, and also has the benefit of
7541   // activating LSR more, since that pass can't optimize vectorized
7542   // addresses.
7543   if (TTI.prefersVectorizedAddressing())
7544     return;
7545 
7546   // Start with all scalar pointer uses.
7547   SmallPtrSet<Instruction *, 8> AddrDefs;
7548   for (BasicBlock *BB : TheLoop->blocks())
7549     for (Instruction &I : *BB) {
7550       Instruction *PtrDef =
7551         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7552       if (PtrDef && TheLoop->contains(PtrDef) &&
7553           getWideningDecision(&I, VF) != CM_GatherScatter)
7554         AddrDefs.insert(PtrDef);
7555     }
7556 
7557   // Add all instructions used to generate the addresses.
7558   SmallVector<Instruction *, 4> Worklist;
7559   append_range(Worklist, AddrDefs);
7560   while (!Worklist.empty()) {
7561     Instruction *I = Worklist.pop_back_val();
7562     for (auto &Op : I->operands())
7563       if (auto *InstOp = dyn_cast<Instruction>(Op))
7564         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7565             AddrDefs.insert(InstOp).second)
7566           Worklist.push_back(InstOp);
7567   }
7568 
7569   for (auto *I : AddrDefs) {
7570     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
7573       // if the loaded register is involved in an address computation, it is
7574       // instead changed here when we know this is the case.
7575       InstWidening Decision = getWideningDecision(I, VF);
7576       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7577         // Scalarize a widened load of address.
7578         setWideningDecision(
7579             I, VF, CM_Scalarize,
7580             (VF.getKnownMinValue() *
7581              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7582       else if (auto Group = getInterleavedAccessGroup(I)) {
7583         // Scalarize an interleave group of address loads.
7584         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7585           if (Instruction *Member = Group->getMember(I))
7586             setWideningDecision(
7587                 Member, VF, CM_Scalarize,
7588                 (VF.getKnownMinValue() *
7589                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7590         }
7591       }
7592     } else
      // Make sure I gets scalarized and is given a cost estimate without
      // scalarization overhead.
7595       ForcedScalars[VF].insert(I);
7596   }
7597 }
7598 
7599 InstructionCost
7600 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7601                                                Type *&VectorTy) {
7602   Type *RetTy = I->getType();
7603   if (canTruncateToMinimalBitwidth(I, VF))
7604     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7605   auto SE = PSE.getSE();
7606   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7607 
7608   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7609                                                 ElementCount VF) -> bool {
7610     if (VF.isScalar())
7611       return true;
7612 
7613     auto Scalarized = InstsToScalarize.find(VF);
7614     assert(Scalarized != InstsToScalarize.end() &&
7615            "VF not yet analyzed for scalarization profitability");
7616     return !Scalarized->second.count(I) &&
7617            llvm::all_of(I->users(), [&](User *U) {
7618              auto *UI = cast<Instruction>(U);
7619              return !Scalarized->second.count(UI);
7620            });
7621   };
7622   (void) hasSingleCopyAfterVectorization;
7623 
7624   if (isScalarAfterVectorization(I, VF)) {
7625     // With the exception of GEPs and PHIs, after scalarization there should
7626     // only be one copy of the instruction generated in the loop. This is
7627     // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
7629     // it means we don't have to multiply the instruction cost by VF.
7630     assert(I->getOpcode() == Instruction::GetElementPtr ||
7631            I->getOpcode() == Instruction::PHI ||
7632            (I->getOpcode() == Instruction::BitCast &&
7633             I->getType()->isPointerTy()) ||
7634            hasSingleCopyAfterVectorization(I, VF));
7635     VectorTy = RetTy;
7636   } else
7637     VectorTy = ToVectorTy(RetTy, VF);
7638 
7639   // TODO: We need to estimate the cost of intrinsic calls.
7640   switch (I->getOpcode()) {
7641   case Instruction::GetElementPtr:
7642     // We mark this instruction as zero-cost because the cost of GEPs in
7643     // vectorized code depends on whether the corresponding memory instruction
7644     // is scalarized or not. Therefore, we handle GEPs with the memory
7645     // instruction cost.
7646     return 0;
7647   case Instruction::Br: {
7648     // In cases of scalarized and predicated instructions, there will be VF
7649     // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
7651     bool ScalarPredicatedBB = false;
7652     BranchInst *BI = cast<BranchInst>(I);
7653     if (VF.isVector() && BI->isConditional() &&
7654         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7655          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7656       ScalarPredicatedBB = true;
7657 
7658     if (ScalarPredicatedBB) {
      // It is not possible to scalarize a scalable vector with predicated
      // instructions.
7660       if (VF.isScalable())
7661         return InstructionCost::getInvalid();
7662       // Return cost for branches around scalarized and predicated blocks.
7663       auto *Vec_i1Ty =
7664           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7665       return (
7666           TTI.getScalarizationOverhead(
7667               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7668           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7669     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7670       // The back-edge branch will remain, as will all scalar branches.
7671       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7672     else
7673       // This branch will be eliminated by if-conversion.
7674       return 0;
7675     // Note: We currently assume zero cost for an unconditional branch inside
7676     // a predicated block since it will become a fall-through, although we
7677     // may decide in the future to call TTI for all branches.
7678   }
7679   case Instruction::PHI: {
7680     auto *Phi = cast<PHINode>(I);
7681 
7682     // First-order recurrences are replaced by vector shuffles inside the loop.
7683     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7684     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7685       return TTI.getShuffleCost(
7686           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7687           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7688 
7689     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7690     // converted into select instructions. We require N - 1 selects per phi
7691     // node, where N is the number of incoming values.
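    // For example, a phi merging three incoming values becomes two chained
    // selects after if-conversion.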
7692     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7693       return (Phi->getNumIncomingValues() - 1) *
7694              TTI.getCmpSelInstrCost(
7695                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7696                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7697                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7698 
7699     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7700   }
7701   case Instruction::UDiv:
7702   case Instruction::SDiv:
7703   case Instruction::URem:
7704   case Instruction::SRem:
7705     // If we have a predicated instruction, it may not be executed for each
7706     // vector lane. Get the scalarization cost and scale this amount by the
7707     // probability of executing the predicated block. If the instruction is not
7708     // predicated, we fall through to the next case.
7709     if (VF.isVector() && isScalarWithPredication(I)) {
7710       InstructionCost Cost = 0;
7711 
7712       // These instructions have a non-void type, so account for the phi nodes
7713       // that we will create. This cost is likely to be zero. The phi node
7714       // cost, if any, should be scaled by the block probability because it
7715       // models a copy at the end of each predicated block.
7716       Cost += VF.getKnownMinValue() *
7717               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7718 
7719       // The cost of the non-predicated instruction.
7720       Cost += VF.getKnownMinValue() *
7721               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7722 
7723       // The cost of insertelement and extractelement instructions needed for
7724       // scalarization.
7725       Cost += getScalarizationOverhead(I, VF);
7726 
7727       // Scale the cost by the probability of executing the predicated blocks.
7728       // This assumes the predicated block for each vector lane is equally
7729       // likely.
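      // For example, if getReciprocalPredBlockProb() returns 2 (i.e. each
      // predicated block is assumed to execute on half the iterations), the
      // predicated cost computed above is halved.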
7730       return Cost / getReciprocalPredBlockProb();
7731     }
7732     LLVM_FALLTHROUGH;
7733   case Instruction::Add:
7734   case Instruction::FAdd:
7735   case Instruction::Sub:
7736   case Instruction::FSub:
7737   case Instruction::Mul:
7738   case Instruction::FMul:
7739   case Instruction::FDiv:
7740   case Instruction::FRem:
7741   case Instruction::Shl:
7742   case Instruction::LShr:
7743   case Instruction::AShr:
7744   case Instruction::And:
7745   case Instruction::Or:
7746   case Instruction::Xor: {
7747     // Since we will replace the stride by 1 the multiplication should go away.
7748     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7749       return 0;
7750 
7751     // Detect reduction patterns
7752     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7753       return *RedCost;
7754 
7755     // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7757     Value *Op2 = I->getOperand(1);
7758     TargetTransformInfo::OperandValueProperties Op2VP;
7759     TargetTransformInfo::OperandValueKind Op2VK =
7760         TTI.getOperandInfo(Op2, Op2VP);
7761     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7762       Op2VK = TargetTransformInfo::OK_UniformValue;
7763 
7764     SmallVector<const Value *, 4> Operands(I->operand_values());
7765     return TTI.getArithmeticInstrCost(
7766         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7767         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7768   }
7769   case Instruction::FNeg: {
7770     return TTI.getArithmeticInstrCost(
7771         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7772         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7773         TargetTransformInfo::OP_None, I->getOperand(0), I);
7774   }
7775   case Instruction::Select: {
7776     SelectInst *SI = cast<SelectInst>(I);
7777     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7778     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7779 
7780     const Value *Op0, *Op1;
7781     using namespace llvm::PatternMatch;
7782     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7783                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7784       // select x, y, false --> x & y
7785       // select x, true, y --> x | y
7786       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7787       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7788       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7789       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7790       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7791               Op1->getType()->getScalarSizeInBits() == 1);
7792 
7793       SmallVector<const Value *, 2> Operands{Op0, Op1};
7794       return TTI.getArithmeticInstrCost(
7795           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7796           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7797     }
7798 
7799     Type *CondTy = SI->getCondition()->getType();
7800     if (!ScalarCond)
7801       CondTy = VectorType::get(CondTy, VF);
7802     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7803                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7804   }
7805   case Instruction::ICmp:
7806   case Instruction::FCmp: {
7807     Type *ValTy = I->getOperand(0)->getType();
7808     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7809     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7810       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7811     VectorTy = ToVectorTy(ValTy, VF);
7812     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7813                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7814   }
7815   case Instruction::Store:
7816   case Instruction::Load: {
7817     ElementCount Width = VF;
7818     if (Width.isVector()) {
7819       InstWidening Decision = getWideningDecision(I, Width);
7820       assert(Decision != CM_Unknown &&
7821              "CM decision should be taken at this point");
7822       if (Decision == CM_Scalarize)
7823         Width = ElementCount::getFixed(1);
7824     }
7825     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7826     return getMemoryInstructionCost(I, VF);
7827   }
7828   case Instruction::BitCast:
7829     if (I->getType()->isPointerTy())
7830       return 0;
7831     LLVM_FALLTHROUGH;
7832   case Instruction::ZExt:
7833   case Instruction::SExt:
7834   case Instruction::FPToUI:
7835   case Instruction::FPToSI:
7836   case Instruction::FPExt:
7837   case Instruction::PtrToInt:
7838   case Instruction::IntToPtr:
7839   case Instruction::SIToFP:
7840   case Instruction::UIToFP:
7841   case Instruction::Trunc:
7842   case Instruction::FPTrunc: {
7843     // Computes the CastContextHint from a Load/Store instruction.
7844     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7845       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7846              "Expected a load or a store!");
7847 
7848       if (VF.isScalar() || !TheLoop->contains(I))
7849         return TTI::CastContextHint::Normal;
7850 
7851       switch (getWideningDecision(I, VF)) {
7852       case LoopVectorizationCostModel::CM_GatherScatter:
7853         return TTI::CastContextHint::GatherScatter;
7854       case LoopVectorizationCostModel::CM_Interleave:
7855         return TTI::CastContextHint::Interleave;
7856       case LoopVectorizationCostModel::CM_Scalarize:
7857       case LoopVectorizationCostModel::CM_Widen:
7858         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7859                                         : TTI::CastContextHint::Normal;
7860       case LoopVectorizationCostModel::CM_Widen_Reverse:
7861         return TTI::CastContextHint::Reversed;
7862       case LoopVectorizationCostModel::CM_Unknown:
7863         llvm_unreachable("Instr did not go through cost modelling?");
7864       }
7865 
7866       llvm_unreachable("Unhandled case!");
7867     };
7868 
7869     unsigned Opcode = I->getOpcode();
7870     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7871     // For Trunc, the context is the only user, which must be a StoreInst.
7872     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7873       if (I->hasOneUse())
7874         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7875           CCH = ComputeCCH(Store);
7876     }
7877     // For Z/Sext, the context is the operand, which must be a LoadInst.
7878     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7879              Opcode == Instruction::FPExt) {
7880       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7881         CCH = ComputeCCH(Load);
7882     }
7883 
7884     // We optimize the truncation of induction variables having constant
7885     // integer steps. The cost of these truncations is the same as the scalar
7886     // operation.
7887     if (isOptimizableIVTruncate(I, VF)) {
7888       auto *Trunc = cast<TruncInst>(I);
7889       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7890                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7891     }
7892 
7893     // Detect reduction patterns
7894     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7895       return *RedCost;
7896 
7897     Type *SrcScalarTy = I->getOperand(0)->getType();
7898     Type *SrcVecTy =
7899         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7900     if (canTruncateToMinimalBitwidth(I, VF)) {
7901       // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
7903       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7904       //
7905       // Calculate the modified src and dest types.
7906       Type *MinVecTy = VectorTy;
7907       if (Opcode == Instruction::Trunc) {
7908         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7909         VectorTy =
7910             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7911       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7912         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7913         VectorTy =
7914             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7915       }
7916     }
7917 
7918     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7919   }
7920   case Instruction::Call: {
7921     bool NeedToScalarize;
7922     CallInst *CI = cast<CallInst>(I);
7923     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7924     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7925       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7926       return std::min(CallCost, IntrinsicCost);
7927     }
7928     return CallCost;
7929   }
7930   case Instruction::ExtractValue:
7931     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7932   case Instruction::Alloca:
7933     // We cannot easily widen alloca to a scalable alloca, as
7934     // the result would need to be a vector of pointers.
7935     if (VF.isScalable())
7936       return InstructionCost::getInvalid();
7937     LLVM_FALLTHROUGH;
7938   default:
7939     // This opcode is unknown. Assume that it is the same as 'mul'.
7940     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7941   } // end of switch.
7942 }
7943 
7944 char LoopVectorize::ID = 0;
7945 
7946 static const char lv_name[] = "Loop Vectorization";
7947 
7948 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7949 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7950 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7951 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7952 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7953 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7954 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7955 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7956 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7957 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7958 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7959 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7960 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7961 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7962 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7963 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7964 
7965 namespace llvm {
7966 
7967 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7968 
7969 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7970                               bool VectorizeOnlyWhenForced) {
7971   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7972 }
7973 
7974 } // end namespace llvm
7975 
7976 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7977   // Check if the pointer operand of a load or store instruction is
7978   // consecutive.
7979   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7980     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7981   return false;
7982 }
7983 
7984 void LoopVectorizationCostModel::collectValuesToIgnore() {
7985   // Ignore ephemeral values.
7986   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7987 
7988   // Ignore type-promoting instructions we identified during reduction
7989   // detection.
7990   for (auto &Reduction : Legal->getReductionVars()) {
7991     RecurrenceDescriptor &RedDes = Reduction.second;
7992     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7993     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7994   }
7995   // Ignore type-casting instructions we identified during induction
7996   // detection.
7997   for (auto &Induction : Legal->getInductionVars()) {
7998     InductionDescriptor &IndDes = Induction.second;
7999     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
8000     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
8001   }
8002 }
8003 
8004 void LoopVectorizationCostModel::collectInLoopReductions() {
8005   for (auto &Reduction : Legal->getReductionVars()) {
8006     PHINode *Phi = Reduction.first;
8007     RecurrenceDescriptor &RdxDesc = Reduction.second;
8008 
8009     // We don't collect reductions that are type promoted (yet).
8010     if (RdxDesc.getRecurrenceType() != Phi->getType())
8011       continue;
8012 
8013     // If the target would prefer this reduction to happen "in-loop", then we
8014     // want to record it as such.
8015     unsigned Opcode = RdxDesc.getOpcode();
8016     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
8017         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
8018                                    TargetTransformInfo::ReductionFlags()))
8019       continue;
8020 
8021     // Check that we can correctly put the reductions into the loop, by
8022     // finding the chain of operations that leads from the phi to the loop
8023     // exit value.
8024     SmallVector<Instruction *, 4> ReductionOperations =
8025         RdxDesc.getReductionOpChain(Phi, TheLoop);
8026     bool InLoop = !ReductionOperations.empty();
8027     if (InLoop) {
8028       InLoopReductionChains[Phi] = ReductionOperations;
8029       // Add the elements to InLoopReductionImmediateChains for cost modelling.
8030       Instruction *LastChain = Phi;
8031       for (auto *I : ReductionOperations) {
8032         InLoopReductionImmediateChains[I] = LastChain;
8033         LastChain = I;
8034       }
8035     }
8036     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
8037                       << " reduction for phi: " << *Phi << "\n");
8038   }
8039 }
8040 
8041 // TODO: we could return a pair of values that specify the max VF and
8042 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
8044 // doesn't have a cost model that can choose which plan to execute if
8045 // more than one is generated.
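// For example, 256-bit wide vector registers and a widest scalar type of
// 32 bits yield a VPlan VF of 8.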
8046 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
8047                                  LoopVectorizationCostModel &CM) {
8048   unsigned WidestType;
8049   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
8050   return WidestVectorRegBits / WidestType;
8051 }
8052 
8053 VectorizationFactor
8054 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
8055   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
8056   ElementCount VF = UserVF;
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
8059   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
8060   // the vectorization pipeline.
8061   if (!OrigLoop->isInnermost()) {
8062     // If the user doesn't provide a vectorization factor, determine a
8063     // reasonable one.
8064     if (UserVF.isZero()) {
8065       VF = ElementCount::getFixed(determineVPlanVF(
8066           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
8067               .getFixedSize(),
8068           CM));
8069       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
8070 
8071       // Make sure we have a VF > 1 for stress testing.
8072       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
8073         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
8074                           << "overriding computed VF.\n");
8075         VF = ElementCount::getFixed(4);
8076       }
8077     }
8078     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
8079     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
8080            "VF needs to be a power of two");
8081     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
8082                       << "VF " << VF << " to build VPlans.\n");
8083     buildVPlans(VF, VF);
8084 
8085     // For VPlan build stress testing, we bail out after VPlan construction.
8086     if (VPlanBuildStressTest)
8087       return VectorizationFactor::Disabled();
8088 
8089     return {VF, 0 /*Cost*/};
8090   }
8091 
8092   LLVM_DEBUG(
8093       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
8094                 "VPlan-native path.\n");
8095   return VectorizationFactor::Disabled();
8096 }
8097 
8098 Optional<VectorizationFactor>
8099 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
8100   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8101   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
8103     return None;
8104 
8105   // Invalidate interleave groups if all blocks of loop will be predicated.
8106   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
8107       !useMaskedInterleavedAccesses(*TTI)) {
8108     LLVM_DEBUG(
8109         dbgs()
8110         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
8111            "which requires masked-interleaved support.\n");
8112     if (CM.InterleaveInfo.invalidateGroups())
8113       // Invalidating interleave groups also requires invalidating all decisions
8114       // based on them, which includes widening decisions and uniform and scalar
8115       // values.
8116       CM.invalidateCostModelingDecisions();
8117   }
8118 
8119   ElementCount MaxUserVF =
8120       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
8121   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
8122   if (!UserVF.isZero() && UserVFIsLegal) {
8123     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
8124            "VF needs to be a power of two");
8125     // Collect the instructions (and their associated costs) that will be more
8126     // profitable to scalarize.
8127     if (CM.selectUserVectorizationFactor(UserVF)) {
8128       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
8129       CM.collectInLoopReductions();
8130       buildVPlansWithVPRecipes(UserVF, UserVF);
8131       LLVM_DEBUG(printPlans(dbgs()));
8132       return {{UserVF, 0}};
8133     } else
8134       reportVectorizationInfo("UserVF ignored because of invalid costs.",
8135                               "InvalidCost", ORE, OrigLoop);
8136   }
8137 
8138   // Populate the set of Vectorization Factor Candidates.
8139   ElementCountSet VFCandidates;
8140   for (auto VF = ElementCount::getFixed(1);
8141        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
8142     VFCandidates.insert(VF);
8143   for (auto VF = ElementCount::getScalable(1);
8144        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
8145     VFCandidates.insert(VF);
8146 
8147   for (const auto &VF : VFCandidates) {
8148     // Collect Uniform and Scalar instructions after vectorization with VF.
8149     CM.collectUniformsAndScalars(VF);
8150 
8151     // Collect the instructions (and their associated costs) that will be more
8152     // profitable to scalarize.
8153     if (VF.isVector())
8154       CM.collectInstsToScalarize(VF);
8155   }
8156 
8157   CM.collectInLoopReductions();
8158   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
8159   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
8160 
8161   LLVM_DEBUG(printPlans(dbgs()));
8162   if (!MaxFactors.hasVector())
8163     return VectorizationFactor::Disabled();
8164 
8165   // Select the optimal vectorization factor.
8166   auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
8167 
8168   // Check if it is profitable to vectorize with runtime checks.
8169   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
8170   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
8171     bool PragmaThresholdReached =
8172         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
8173     bool ThresholdReached =
8174         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
8175     if ((ThresholdReached && !Hints.allowReordering()) ||
8176         PragmaThresholdReached) {
8177       ORE->emit([&]() {
8178         return OptimizationRemarkAnalysisAliasing(
8179                    DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
8180                    OrigLoop->getHeader())
8181                << "loop not vectorized: cannot prove it is safe to reorder "
8182                   "memory operations";
8183       });
8184       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
8185       Hints.emitRemarkWithHints();
8186       return VectorizationFactor::Disabled();
8187     }
8188   }
8189   return SelectedVF;
8190 }
8191 
8192 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
8193   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
8194                     << '\n');
8195   BestVF = VF;
8196   BestUF = UF;
8197 
8198   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
8199     return !Plan->hasVF(VF);
8200   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
8202 }
8203 
8204 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
8205                                            DominatorTree *DT) {
8206   // Perform the actual loop transformation.
8207 
8208   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
8209   assert(BestVF.hasValue() && "Vectorization Factor is missing");
8210   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
8211 
8212   VPTransformState State{
8213       *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
8214   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
8215   State.TripCount = ILV.getOrCreateTripCount(nullptr);
8216   State.CanonicalIV = ILV.Induction;
8217 
8218   ILV.printDebugTracesAtStart();
8219 
8220   //===------------------------------------------------===//
8221   //
  // Notice: any optimization or new instruction that goes
8223   // into the code below should also be implemented in
8224   // the cost-model.
8225   //
8226   //===------------------------------------------------===//
8227 
8228   // 2. Copy and widen instructions from the old loop into the new loop.
8229   VPlans.front()->execute(&State);
8230 
8231   // 3. Fix the vectorized code: take care of header phi's, live-outs,
8232   //    predication, updating analyses.
8233   ILV.fixVectorizedLoop(State);
8234 
8235   ILV.printDebugTracesAtEnd();
8236 }
8237 
8238 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
8239 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
8240   for (const auto &Plan : VPlans)
8241     if (PrintVPlansInDotFormat)
8242       Plan->printDOT(O);
8243     else
8244       Plan->print(O);
8245 }
8246 #endif
8247 
8248 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
8249     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
8250 
  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by the
  // terminator.
8254   SmallVector<BasicBlock*> ExitingBlocks;
8255   OrigLoop->getExitingBlocks(ExitingBlocks);
8256   for (auto *BB : ExitingBlocks) {
8257     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
8258     if (!Cmp || !Cmp->hasOneUse())
8259       continue;
8260 
8261     // TODO: we should introduce a getUniqueExitingBlocks on Loop
8262     if (!DeadInstructions.insert(Cmp).second)
8263       continue;
8264 
    // An operand of the icmp is often a dead trunc, used by IndUpdate.
8266     // TODO: can recurse through operands in general
8267     for (Value *Op : Cmp->operands()) {
8268       if (isa<TruncInst>(Op) && Op->hasOneUse())
8269           DeadInstructions.insert(cast<Instruction>(Op));
8270     }
8271   }
8272 
8273   // We create new "steps" for induction variable updates to which the original
8274   // induction variables map. An original update instruction will be dead if
8275   // all its users except the induction variable are dead.
8276   auto *Latch = OrigLoop->getLoopLatch();
8277   for (auto &Induction : Legal->getInductionVars()) {
8278     PHINode *Ind = Induction.first;
8279     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8280 
8281     // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
8283     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
8284       continue;
8285 
8286     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
8287           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
8288         }))
8289       DeadInstructions.insert(IndUpdate);
8290 
    // We also record as "Dead" the type-casting instructions we had identified
8292     // during induction analysis. We don't need any handling for them in the
8293     // vectorized loop because we have proven that, under a proper runtime
8294     // test guarding the vectorized loop, the value of the phi, and the casted
8295     // value of the phi, are the same. The last instruction in this casting chain
8296     // will get its scalar/vector/widened def from the scalar/vector/widened def
8297     // of the respective phi node. Any other casts in the induction def-use chain
8298     // have no other uses outside the phi update chain, and will be ignored.
8299     InductionDescriptor &IndDes = Induction.second;
8300     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
8301     DeadInstructions.insert(Casts.begin(), Casts.end());
8302   }
8303 }
8304 
8305 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
8306 
8307 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
8308 
8309 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
8310                                         Instruction::BinaryOps BinOp) {
8311   // When unrolling and the VF is 1, we only need to add a simple scalar.
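  // For example (illustrative), with StartIdx == 2 this returns
  // Val + (2 * Step), or the floating-point equivalent built with FMul/BinOp.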
8312   Type *Ty = Val->getType();
8313   assert(!Ty->isVectorTy() && "Val must be a scalar");
8314 
8315   if (Ty->isFloatingPointTy()) {
8316     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
8317 
8318     // Floating-point operations inherit FMF via the builder's flags.
8319     Value *MulOp = Builder.CreateFMul(C, Step);
8320     return Builder.CreateBinOp(BinOp, Val, MulOp);
8321   }
8322   Constant *C = ConstantInt::get(Ty, StartIdx);
8323   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
8324 }
8325 
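// Adds llvm.loop.unroll.runtime.disable to the loop ID metadata, preserving
// any existing operands. Illustrative result (metadata numbers are examples
// only):
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}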
8326 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
8327   SmallVector<Metadata *, 4> MDs;
8328   // Reserve first location for self reference to the LoopID metadata node.
8329   MDs.push_back(nullptr);
8330   bool IsUnrollMetadata = false;
8331   MDNode *LoopID = L->getLoopID();
8332   if (LoopID) {
8333     // First find existing loop unrolling disable metadata.
8334     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
8335       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
8336       if (MD) {
8337         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
8338         IsUnrollMetadata =
8339             S && S->getString().startswith("llvm.loop.unroll.disable");
8340       }
8341       MDs.push_back(LoopID->getOperand(i));
8342     }
8343   }
8344 
8345   if (!IsUnrollMetadata) {
8346     // Add runtime unroll disable metadata.
8347     LLVMContext &Context = L->getHeader()->getContext();
8348     SmallVector<Metadata *, 1> DisableOperands;
8349     DisableOperands.push_back(
8350         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
8351     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
8352     MDs.push_back(DisableNode);
8353     MDNode *NewLoopID = MDNode::get(Context, MDs);
8354     // Set operand 0 to refer to the loop id itself.
8355     NewLoopID->replaceOperandWith(0, NewLoopID);
8356     L->setLoopID(NewLoopID);
8357   }
8358 }
8359 
8360 //===--------------------------------------------------------------------===//
8361 // EpilogueVectorizerMainLoop
8362 //===--------------------------------------------------------------------===//
8363 
8364 /// This function is partially responsible for generating the control flow
8365 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8366 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
8367   MDNode *OrigLoopID = OrigLoop->getLoopID();
8368   Loop *Lp = createVectorLoopSkeleton("");
8369 
8370   // Generate the code to check the minimum iteration count of the vector
8371   // epilogue (see below).
8372   EPI.EpilogueIterationCountCheck =
8373       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
8374   EPI.EpilogueIterationCountCheck->setName("iter.check");
8375 
8376   // Generate the code to check any assumptions that we've made for SCEV
8377   // expressions.
8378   EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
8379 
8380   // Generate the code that checks at runtime if arrays overlap. We put the
8381   // checks into a separate block to make the more common case of few elements
8382   // faster.
8383   EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
8384 
8385   // Generate the iteration count check for the main loop, *after* the check
8386   // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer path length for
  // the main loop is compensated for by the gain from vectorizing the larger
8389   // trip count. Note: the branch will get updated later on when we vectorize
8390   // the epilogue.
8391   EPI.MainLoopIterationCountCheck =
8392       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
8393 
8394   // Generate the induction variable.
8395   OldInduction = Legal->getPrimaryInduction();
8396   Type *IdxTy = Legal->getWidestInductionType();
8397   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8398   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8399   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8400   EPI.VectorTripCount = CountRoundDown;
8401   Induction =
8402       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8403                               getDebugLocFromInstOrOperands(OldInduction));
8404 
  // Skip creating induction resume values here because they will be created
  // in the second pass. If we created them here, they wouldn't be used anyway,
  // because the VPlan used in the second pass still contains the inductions
  // from the original loop.
8409 
8410   return completeLoopSkeleton(Lp, OrigLoopID);
8411 }
8412 
8413 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
8414   LLVM_DEBUG({
8415     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
8416            << "Main Loop VF:" << EPI.MainLoopVF
8417            << ", Main Loop UF:" << EPI.MainLoopUF
8418            << ", Epilogue Loop VF:" << EPI.EpilogueVF
8419            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8420   });
8421 }
8422 
8423 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
8424   DEBUG_WITH_TYPE(VerboseDebug, {
8425     dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
8426   });
8427 }
8428 
8429 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8430     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
8431   assert(L && "Expected valid Loop.");
8432   assert(Bypass && "Expected valid bypass basic block.");
8433   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
8434   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8435   Value *Count = getOrCreateTripCount(L);
8436   // Reuse existing vector loop preheader for TC checks.
8437   // Note that new preheader block is generated for vector loop.
8438   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8439   IRBuilder<> Builder(TCCheckBlock->getTerminator());
8440 
  // Generate code to check if the loop's trip count is less than VF * UF of
  // the corresponding (main or epilogue) vector loop.
  auto P = Cost->requiresScalarEpilogue(VFactor) ? ICmpInst::ICMP_ULE
                                                 : ICmpInst::ICMP_ULT;
8445 
8446   Value *CheckMinIters = Builder.CreateICmp(
8447       P, Count, getRuntimeVF(Builder, Count->getType(), VFactor * UFactor),
8448       "min.iters.check");
8449 
8450   if (!ForEpilogue)
8451     TCCheckBlock->setName("vector.main.loop.iter.check");
8452 
8453   // Create new preheader for vector loop.
8454   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8455                                    DT, LI, nullptr, "vector.ph");
8456 
8457   if (ForEpilogue) {
8458     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8459                                  DT->getNode(Bypass)->getIDom()) &&
8460            "TC check is expected to dominate Bypass");
8461 
8462     // Update dominator for Bypass & LoopExit.
8463     DT->changeImmediateDominator(Bypass, TCCheckBlock);
8464     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8465       // For loops with multiple exits, there's no edge from the middle block
8466       // to exit blocks (as the epilogue must run) and thus no need to update
8467       // the immediate dominator of the exit blocks.
8468       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8469 
8470     LoopBypassBlocks.push_back(TCCheckBlock);
8471 
8472     // Save the trip count so we don't have to regenerate it in the
8473     // vec.epilog.iter.check. This is safe to do because the trip count
8474     // generated here dominates the vector epilog iter check.
8475     EPI.TripCount = Count;
8476   }
8477 
8478   ReplaceInstWithInst(
8479       TCCheckBlock->getTerminator(),
8480       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8481 
8482   return TCCheckBlock;
8483 }
8484 
8485 //===--------------------------------------------------------------------===//
8486 // EpilogueVectorizerEpilogueLoop
8487 //===--------------------------------------------------------------------===//
8488 
8489 /// This function is partially responsible for generating the control flow
8490 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
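/// In this second pass, the vector loop preheader produced by the skeleton is
/// renamed "vec.epilog.iter.check" (it compares the remaining iteration count
/// against the epilogue VF * UF) and a fresh "vec.epilog.ph" preheader is
/// split off for the epilogue vector loop; see the documentation linked above
/// for the full picture.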
8491 BasicBlock *
8492 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8493   MDNode *OrigLoopID = OrigLoop->getLoopID();
8494   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
8495 
  // Now, compare the remaining count and, if there aren't enough iterations to
  // execute the vectorized epilogue, skip to the scalar part.
8498   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
8499   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
8500   LoopVectorPreHeader =
8501       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
8502                  LI, nullptr, "vec.epilog.ph");
8503   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
8504                                           VecEpilogueIterationCountCheck);
8505 
8506   // Adjust the control flow taking the state info from the main loop
8507   // vectorization into account.
8508   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8509          "expected this to be saved from the previous pass.");
8510   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8511       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8512 
8513   DT->changeImmediateDominator(LoopVectorPreHeader,
8514                                EPI.MainLoopIterationCountCheck);
8515 
8516   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8517       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8518 
8519   if (EPI.SCEVSafetyCheck)
8520     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8521         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8522   if (EPI.MemSafetyCheck)
8523     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8524         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8525 
8526   DT->changeImmediateDominator(
8527       VecEpilogueIterationCountCheck,
8528       VecEpilogueIterationCountCheck->getSinglePredecessor());
8529 
8530   DT->changeImmediateDominator(LoopScalarPreHeader,
8531                                EPI.EpilogueIterationCountCheck);
8532   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
    // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
    // dominator of the exit blocks.
8536     DT->changeImmediateDominator(LoopExitBlock,
8537                                  EPI.EpilogueIterationCountCheck);
8538 
8539   // Keep track of bypass blocks, as they feed start values to the induction
8540   // phis in the scalar loop preheader.
8541   if (EPI.SCEVSafetyCheck)
8542     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8543   if (EPI.MemSafetyCheck)
8544     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8545   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8546 
  // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
8549   Type *IdxTy = Legal->getWidestInductionType();
8550   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8551                                          LoopVectorPreHeader->getFirstNonPHI());
8552   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8553   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8554                            EPI.MainLoopIterationCountCheck);
8555 
8556   // Generate the induction variable.
8557   OldInduction = Legal->getPrimaryInduction();
8558   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8559   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8560   Value *StartIdx = EPResumeVal;
8561   Induction =
8562       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8563                               getDebugLocFromInstOrOperands(OldInduction));
8564 
8565   // Generate induction resume values. These variables save the new starting
8566   // indexes for the scalar loop. They are used to test if there are any tail
8567   // iterations left once the vector loop has completed.
8568   // Note that when the vectorized epilogue is skipped due to iteration count
8569   // check, then the resume value for the induction variable comes from
8570   // the trip count of the main vector loop, hence passing the AdditionalBypass
8571   // argument.
8572   createInductionResumeValues(Lp, CountRoundDown,
8573                               {VecEpilogueIterationCountCheck,
8574                                EPI.VectorTripCount} /* AdditionalBypass */);
8575 
8576   AddRuntimeUnrollDisableMetaData(Lp);
8577   return completeLoopSkeleton(Lp, OrigLoopID);
8578 }
8579 
8580 BasicBlock *
8581 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8582     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8583 
  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
8586   assert(
8587       (!isa<Instruction>(EPI.TripCount) ||
8588        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8589       "saved trip count does not dominate insertion point.");
8590   Value *TC = EPI.TripCount;
8591   IRBuilder<> Builder(Insert->getTerminator());
8592   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8593 
  // Generate code to check if the remaining iteration count is less than
  // VF * UF of the vector epilogue loop.
8596   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8597       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8598 
8599   Value *CheckMinIters = Builder.CreateICmp(
8600       P, Count,
8601       getRuntimeVF(Builder, Count->getType(), EPI.EpilogueVF * EPI.EpilogueUF),
8602       "min.epilog.iters.check");
8603 
8604   ReplaceInstWithInst(
8605       Insert->getTerminator(),
8606       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8607 
8608   LoopBypassBlocks.push_back(Insert);
8609   return Insert;
8610 }
8611 
8612 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8613   LLVM_DEBUG({
8614     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8615            << "Epilogue Loop VF:" << EPI.EpilogueVF
8616            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8617   });
8618 }
8619 
8620 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8621   DEBUG_WITH_TYPE(VerboseDebug, {
8622     dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
8623   });
8624 }
8625 
8626 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8627     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8628   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8629   bool PredicateAtRangeStart = Predicate(Range.Start);
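  // Clamp Range.End at the first power-of-two VF for which Predicate gives a
  // different answer than at Range.Start. E.g. if Range.Start is 4 and
  // Range.End is 32, and Predicate holds for VF = 4 and VF = 8 but not for
  // VF = 16, then Range.End is clamped to 16 and the value at VF = 4 (true)
  // is returned.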
8630 
8631   for (ElementCount TmpVF = Range.Start * 2;
8632        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8633     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8634       Range.End = TmpVF;
8635       break;
8636     }
8637 
8638   return PredicateAtRangeStart;
8639 }
8640 
8641 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8642 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8643 /// of VF's starting at a given VF and extending it as much as possible. Each
8644 /// vectorization decision can potentially shorten this sub-range during
8645 /// buildVPlan().
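/// For example, with \p MinVF = 1 and \p MaxVF = 16, the first sub-range
/// starts at VF = 1; if some decision differs between VF = 1 and the vector
/// VFs, that sub-range is clamped to just {1} and the next VPlan is built for
/// the sub-range starting at VF = 2 (illustrative; the actual split depends
/// on the recorded decisions).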
8646 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8647                                            ElementCount MaxVF) {
8648   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8649   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8650     VFRange SubRange = {VF, MaxVFPlusOne};
8651     VPlans.push_back(buildVPlan(SubRange));
8652     VF = SubRange.End;
8653   }
8654 }
8655 
8656 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8657                                          VPlanPtr &Plan) {
8658   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8659 
8660   // Look for cached value.
8661   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8662   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8663   if (ECEntryIt != EdgeMaskCache.end())
8664     return ECEntryIt->second;
8665 
8666   VPValue *SrcMask = createBlockInMask(Src, Plan);
8667 
8668   // The terminator has to be a branch inst!
8669   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8670   assert(BI && "Unexpected terminator found");
8671 
8672   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8673     return EdgeMaskCache[Edge] = SrcMask;
8674 
8675   // If source is an exiting block, we know the exit edge is dynamically dead
8676   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8677   // adding uses of an otherwise potentially dead instruction.
8678   if (OrigLoop->isLoopExiting(Src))
8679     return EdgeMaskCache[Edge] = SrcMask;
8680 
8681   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8682   assert(EdgeMask && "No Edge Mask found for condition");
8683 
8684   if (BI->getSuccessor(0) != Dst)
8685     EdgeMask = Builder.createNot(EdgeMask);
8686 
8687   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8688     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8689     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8690     // The select version does not introduce new UB if SrcMask is false and
8691     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
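    // Conceptually, instead of 'and i1 %src.mask, %edge.mask' this emits
    //   %mask = select i1 %src.mask, i1 %edge.mask, i1 false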
8692     VPValue *False = Plan->getOrAddVPValue(
8693         ConstantInt::getFalse(BI->getCondition()->getType()));
8694     EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
8695   }
8696 
8697   return EdgeMaskCache[Edge] = EdgeMask;
8698 }
8699 
8700 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8701   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8702 
8703   // Look for cached value.
8704   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8705   if (BCEntryIt != BlockMaskCache.end())
8706     return BCEntryIt->second;
8707 
8708   // All-one mask is modelled as no-mask following the convention for masked
8709   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8710   VPValue *BlockMask = nullptr;
8711 
8712   if (OrigLoop->getHeader() == BB) {
8713     if (!CM.blockNeedsPredication(BB))
8714       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8715 
8716     // Create the block in mask as the first non-phi instruction in the block.
8717     VPBuilder::InsertPointGuard Guard(Builder);
8718     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
8719     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
8720 
    // Introduce the early-exit compare IV <= BTC to form the header block
    // mask. This is used instead of IV < TC because TC (computed as BTC + 1)
    // may wrap to zero, unlike BTC.
8723     // Start by constructing the desired canonical IV.
8724     VPValue *IV = nullptr;
8725     if (Legal->getPrimaryInduction())
8726       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
8727     else {
8728       auto IVRecipe = new VPWidenCanonicalIVRecipe();
8729       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
8730       IV = IVRecipe->getVPSingleValue();
8731     }
8732     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8733     bool TailFolded = !CM.isScalarEpilogueAllowed();
8734 
8735     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as a second argument, we only pass the IV here and extract the
      // tripcount from the transform state where codegen of the VP
      // instructions happens.
8740       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
8741     } else {
8742       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8743     }
8744     return BlockMaskCache[BB] = BlockMask;
8745   }
8746 
8747   // This is the block mask. We OR all incoming edges.
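  // Each edge mask returned by createEdgeMask already folds in the mask of its
  // source block, so with e.g. two predecessors P0 and P1 the block mask is
  // conceptually (mask(P0) & edge(P0->BB)) | (mask(P1) & edge(P1->BB)).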
8748   for (auto *Predecessor : predecessors(BB)) {
8749     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8750     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8751       return BlockMaskCache[BB] = EdgeMask;
8752 
8753     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8754       BlockMask = EdgeMask;
8755       continue;
8756     }
8757 
8758     BlockMask = Builder.createOr(BlockMask, EdgeMask);
8759   }
8760 
8761   return BlockMaskCache[BB] = BlockMask;
8762 }
8763 
8764 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8765                                                 ArrayRef<VPValue *> Operands,
8766                                                 VFRange &Range,
8767                                                 VPlanPtr &Plan) {
8768   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8769          "Must be called with either a load or store");
8770 
8771   auto willWiden = [&](ElementCount VF) -> bool {
8772     if (VF.isScalar())
8773       return false;
8774     LoopVectorizationCostModel::InstWidening Decision =
8775         CM.getWideningDecision(I, VF);
8776     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8777            "CM decision should be taken at this point.");
8778     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8779       return true;
8780     if (CM.isScalarAfterVectorization(I, VF) ||
8781         CM.isProfitableToScalarize(I, VF))
8782       return false;
8783     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8784   };
8785 
8786   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8787     return nullptr;
8788 
8789   VPValue *Mask = nullptr;
8790   if (Legal->isMaskRequired(I))
8791     Mask = createBlockInMask(I->getParent(), Plan);
8792 
8793   // Determine if the pointer operand of the access is either consecutive or
8794   // reverse consecutive.
8795   LoopVectorizationCostModel::InstWidening Decision =
8796       CM.getWideningDecision(I, Range.Start);
8797   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8798   bool Consecutive =
8799       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8800 
8801   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8802     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8803                                               Consecutive, Reverse);
8804 
8805   StoreInst *Store = cast<StoreInst>(I);
8806   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8807                                             Mask, Consecutive, Reverse);
8808 }
8809 
8810 VPWidenIntOrFpInductionRecipe *
8811 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
8812                                            ArrayRef<VPValue *> Operands) const {
8813   // Check if this is an integer or fp induction. If so, build the recipe that
8814   // produces its scalar and vector values.
8815   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8816   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8817       II.getKind() == InductionDescriptor::IK_FpInduction) {
8818     assert(II.getStartValue() ==
8819            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8820     const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts();
8821     return new VPWidenIntOrFpInductionRecipe(
8822         Phi, Operands[0], Casts.empty() ? nullptr : Casts.front());
8823   }
8824 
8825   return nullptr;
8826 }
8827 
8828 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8829     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8830     VPlan &Plan) const {
8831   // Optimize the special case where the source is a constant integer
8832   // induction variable. Notice that we can only optimize the 'trunc' case
8833   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8834   // (c) other casts depend on pointer size.
8835 
8836   // Determine whether \p K is a truncation based on an induction variable that
8837   // can be optimized.
8838   auto isOptimizableIVTruncate =
8839       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8840     return [=](ElementCount VF) -> bool {
8841       return CM.isOptimizableIVTruncate(K, VF);
8842     };
8843   };
8844 
8845   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8846           isOptimizableIVTruncate(I), Range)) {
8847 
8848     InductionDescriptor II =
8849         Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
8850     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8851     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8852                                              Start, nullptr, I);
8853   }
8854   return nullptr;
8855 }
8856 
8857 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8858                                                 ArrayRef<VPValue *> Operands,
8859                                                 VPlanPtr &Plan) {
8860   // If all incoming values are equal, the incoming VPValue can be used directly
8861   // instead of creating a new VPBlendRecipe.
8862   VPValue *FirstIncoming = Operands[0];
8863   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8864         return FirstIncoming == Inc;
8865       })) {
8866     return Operands[0];
8867   }
8868 
8869   // We know that all PHIs in non-header blocks are converted into selects, so
8870   // we don't have to worry about the insertion order and we can just use the
8871   // builder. At this point we generate the predication tree. There may be
8872   // duplications since this is a simple recursive scan, but future
8873   // optimizations will clean it up.
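  // E.g. a phi with incoming values (%a, BB0) and (%b, BB1) becomes a blend
  // whose operands are {%a, edge-mask(BB0->parent), %b,
  // edge-mask(BB1->parent)}, and is later lowered to a chain of selects.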
8874   SmallVector<VPValue *, 2> OperandsWithMask;
8875   unsigned NumIncoming = Phi->getNumIncomingValues();
8876 
8877   for (unsigned In = 0; In < NumIncoming; In++) {
8878     VPValue *EdgeMask =
8879       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8880     assert((EdgeMask || NumIncoming == 1) &&
8881            "Multiple predecessors with one having a full mask");
8882     OperandsWithMask.push_back(Operands[In]);
8883     if (EdgeMask)
8884       OperandsWithMask.push_back(EdgeMask);
8885   }
8886   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8887 }
8888 
8889 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8890                                                    ArrayRef<VPValue *> Operands,
8891                                                    VFRange &Range) const {
8892 
8893   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8894       [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
8895       Range);
8896 
8897   if (IsPredicated)
8898     return nullptr;
8899 
8900   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8901   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8902              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8903              ID == Intrinsic::pseudoprobe ||
8904              ID == Intrinsic::experimental_noalias_scope_decl))
8905     return nullptr;
8906 
8907   auto willWiden = [&](ElementCount VF) -> bool {
8908     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag shows whether we use an intrinsic or a regular call for the
    // vectorized version of the instruction.
    // Is it beneficial to perform an intrinsic call compared to a lib call?
8913     bool NeedToScalarize = false;
8914     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8915     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8916     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8917     return UseVectorIntrinsic || !NeedToScalarize;
8918   };
8919 
8920   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8921     return nullptr;
8922 
8923   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8924   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8925 }
8926 
8927 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8928   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8929          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // The instruction should be widened, unless it is scalar after
  // vectorization, scalarization is profitable, or it is predicated.
8932   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8933     return CM.isScalarAfterVectorization(I, VF) ||
8934            CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
8935   };
8936   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8937                                                              Range);
8938 }
8939 
8940 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8941                                            ArrayRef<VPValue *> Operands) const {
8942   auto IsVectorizableOpcode = [](unsigned Opcode) {
8943     switch (Opcode) {
8944     case Instruction::Add:
8945     case Instruction::And:
8946     case Instruction::AShr:
8947     case Instruction::BitCast:
8948     case Instruction::FAdd:
8949     case Instruction::FCmp:
8950     case Instruction::FDiv:
8951     case Instruction::FMul:
8952     case Instruction::FNeg:
8953     case Instruction::FPExt:
8954     case Instruction::FPToSI:
8955     case Instruction::FPToUI:
8956     case Instruction::FPTrunc:
8957     case Instruction::FRem:
8958     case Instruction::FSub:
8959     case Instruction::ICmp:
8960     case Instruction::IntToPtr:
8961     case Instruction::LShr:
8962     case Instruction::Mul:
8963     case Instruction::Or:
8964     case Instruction::PtrToInt:
8965     case Instruction::SDiv:
8966     case Instruction::Select:
8967     case Instruction::SExt:
8968     case Instruction::Shl:
8969     case Instruction::SIToFP:
8970     case Instruction::SRem:
8971     case Instruction::Sub:
8972     case Instruction::Trunc:
8973     case Instruction::UDiv:
8974     case Instruction::UIToFP:
8975     case Instruction::URem:
8976     case Instruction::Xor:
8977     case Instruction::ZExt:
8978       return true;
8979     }
8980     return false;
8981   };
8982 
8983   if (!IsVectorizableOpcode(I->getOpcode()))
8984     return nullptr;
8985 
8986   // Success: widen this instruction.
8987   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8988 }
8989 
8990 void VPRecipeBuilder::fixHeaderPhis() {
8991   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8992   for (VPWidenPHIRecipe *R : PhisToFix) {
8993     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8994     VPRecipeBase *IncR =
8995         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8996     R->addOperand(IncR->getVPSingleValue());
8997   }
8998 }
8999 
9000 VPBasicBlock *VPRecipeBuilder::handleReplication(
9001     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
9002     VPlanPtr &Plan) {
9003   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
9004       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
9005       Range);
9006 
9007   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
9008       [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
9009 
9010   // Even if the instruction is not marked as uniform, there are certain
9011   // intrinsic calls that can be effectively treated as such, so we check for
9012   // them here. Conservatively, we only do this for scalable vectors, since
9013   // for fixed-width VFs we can always fall back on full scalarization.
9014   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
9015     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
9016     case Intrinsic::assume:
9017     case Intrinsic::lifetime_start:
9018     case Intrinsic::lifetime_end:
9019       // For scalable vectors if one of the operands is variant then we still
9020       // want to mark as uniform, which will generate one instruction for just
9021       // the first lane of the vector. We can't scalarize the call in the same
9022       // way as for fixed-width vectors because we don't know how many lanes
9023       // there are.
9024       //
9025       // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic, generating the instruction for the
      //      first lane is still better than not generating any at all. For
      //      example, the input may be a splat across all lanes.
9029       //   2. For the lifetime start/end intrinsics the pointer operand only
9030       //      does anything useful when the input comes from a stack object,
9031       //      which suggests it should always be uniform. For non-stack objects
9032       //      the effect is to poison the object, which still allows us to
9033       //      remove the call.
9034       IsUniform = true;
9035       break;
9036     default:
9037       break;
9038     }
9039   }
9040 
9041   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
9042                                        IsUniform, IsPredicated);
9043   setRecipe(I, Recipe);
9044   Plan->addVPValue(I, Recipe);
9045 
9046   // Find if I uses a predicated instruction. If so, it will use its scalar
9047   // value. Avoid hoisting the insert-element which packs the scalar value into
9048   // a vector value, as that happens iff all users use the vector value.
9049   for (VPValue *Op : Recipe->operands()) {
9050     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
9051     if (!PredR)
9052       continue;
9053     auto *RepR =
9054         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
9055     assert(RepR->isPredicated() &&
9056            "expected Replicate recipe to be predicated");
9057     RepR->setAlsoPack(false);
9058   }
9059 
9060   // Finalize the recipe for Instr, first if it is not predicated.
9061   if (!IsPredicated) {
9062     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
9063     VPBB->appendRecipe(Recipe);
9064     return VPBB;
9065   }
9066   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
9067   assert(VPBB->getSuccessors().empty() &&
9068          "VPBB has successors when handling predicated replication.");
9069   // Record predicated instructions for above packing optimizations.
9070   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
9071   VPBlockUtils::insertBlockAfter(Region, VPBB);
9072   auto *RegSucc = new VPBasicBlock();
9073   VPBlockUtils::insertBlockAfter(RegSucc, Region);
9074   return RegSucc;
9075 }
9076 
9077 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
9078                                                       VPRecipeBase *PredRecipe,
9079                                                       VPlanPtr &Plan) {
9080   // Instructions marked for predication are replicated and placed under an
9081   // if-then construct to prevent side-effects.
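  // The region built below has the shape (with <op> being Instr's opcode
  // name):
  //   pred.<op>.entry:    BRANCH-ON-MASK on the block-in mask
  //   pred.<op>.if:       the replicated, predicated instruction
  //   pred.<op>.continue: an optional PHI merging the predicated value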
9082 
9083   // Generate recipes to compute the block mask for this region.
9084   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
9085 
9086   // Build the triangular if-then region.
9087   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
9088   assert(Instr->getParent() && "Predicated instruction not in any basic block");
9089   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
9090   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
9091   auto *PHIRecipe = Instr->getType()->isVoidTy()
9092                         ? nullptr
9093                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
9094   if (PHIRecipe) {
9095     Plan->removeVPValueFor(Instr);
9096     Plan->addVPValue(Instr, PHIRecipe);
9097   }
9098   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
9099   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
9100   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
9101 
9102   // Note: first set Entry as region entry and then connect successors starting
9103   // from it in order, to propagate the "parent" of each VPBasicBlock.
9104   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
9105   VPBlockUtils::connectBlocks(Pred, Exit);
9106 
9107   return Region;
9108 }
9109 
9110 VPRecipeOrVPValueTy
9111 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
9112                                         ArrayRef<VPValue *> Operands,
9113                                         VFRange &Range, VPlanPtr &Plan) {
9114   // First, check for specific widening recipes that deal with calls, memory
9115   // operations, inductions and Phi nodes.
9116   if (auto *CI = dyn_cast<CallInst>(Instr))
9117     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
9118 
9119   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
9120     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
9121 
9122   VPRecipeBase *Recipe;
9123   if (auto Phi = dyn_cast<PHINode>(Instr)) {
9124     if (Phi->getParent() != OrigLoop->getHeader())
9125       return tryToBlend(Phi, Operands, Plan);
9126     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
9127       return toVPRecipeResult(Recipe);
9128 
9129     VPWidenPHIRecipe *PhiRecipe = nullptr;
9130     if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
9131       VPValue *StartV = Operands[0];
9132       if (Legal->isReductionVariable(Phi)) {
9133         RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9134         assert(RdxDesc.getRecurrenceStartValue() ==
9135                Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
9136         PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
9137                                              CM.isInLoopReduction(Phi),
9138                                              CM.useOrderedReductions(RdxDesc));
9139       } else {
9140         PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
9141       }
9142 
9143       // Record the incoming value from the backedge, so we can add the incoming
9144       // value from the backedge after all recipes have been created.
9145       recordRecipeOf(cast<Instruction>(
9146           Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
9147       PhisToFix.push_back(PhiRecipe);
9148     } else {
9149       // TODO: record start and backedge value for remaining pointer induction
9150       // phis.
9151       assert(Phi->getType()->isPointerTy() &&
9152              "only pointer phis should be handled here");
9153       PhiRecipe = new VPWidenPHIRecipe(Phi);
9154     }
9155 
9156     return toVPRecipeResult(PhiRecipe);
9157   }
9158 
9159   if (isa<TruncInst>(Instr) &&
9160       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
9161                                                Range, *Plan)))
9162     return toVPRecipeResult(Recipe);
9163 
9164   if (!shouldWiden(Instr, Range))
9165     return nullptr;
9166 
9167   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
9168     return toVPRecipeResult(new VPWidenGEPRecipe(
9169         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
9170 
9171   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
9172     bool InvariantCond =
9173         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
9174     return toVPRecipeResult(new VPWidenSelectRecipe(
9175         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
9176   }
9177 
9178   return toVPRecipeResult(tryToWiden(Instr, Operands));
9179 }
9180 
9181 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
9182                                                         ElementCount MaxVF) {
9183   assert(OrigLoop->isInnermost() && "Inner loop expected.");
9184 
9185   // Collect instructions from the original loop that will become trivially dead
9186   // in the vectorized loop. We don't need to vectorize these instructions. For
9187   // example, original induction update instructions can become dead because we
9188   // separately emit induction "steps" when generating code for the new loop.
9189   // Similarly, we create a new latch condition when setting up the structure
9190   // of the new loop, so the old one can become dead.
9191   SmallPtrSet<Instruction *, 4> DeadInstructions;
9192   collectTriviallyDeadInstructions(DeadInstructions);
9193 
9194   // Add assume instructions we need to drop to DeadInstructions, to prevent
9195   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
9198   auto &ConditionalAssumes = Legal->getConditionalAssumes();
9199   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
9200 
9201   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
9202   // Dead instructions do not need sinking. Remove them from SinkAfter.
9203   for (Instruction *I : DeadInstructions)
9204     SinkAfter.erase(I);
9205 
9206   // Cannot sink instructions after dead instructions (there won't be any
9207   // recipes for them). Instead, find the first non-dead previous instruction.
9208   for (auto &P : Legal->getSinkAfter()) {
9209     Instruction *SinkTarget = P.second;
9210     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
9211     (void)FirstInst;
9212     while (DeadInstructions.contains(SinkTarget)) {
9213       assert(
9214           SinkTarget != FirstInst &&
9215           "Must find a live instruction (at least the one feeding the "
9216           "first-order recurrence PHI) before reaching beginning of the block");
9217       SinkTarget = SinkTarget->getPrevNode();
9218       assert(SinkTarget != P.first &&
9219              "sink source equals target, no sinking required");
9220     }
9221     P.second = SinkTarget;
9222   }
9223 
9224   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
9225   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
9226     VFRange SubRange = {VF, MaxVFPlusOne};
9227     VPlans.push_back(
9228         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
9229     VF = SubRange.End;
9230   }
9231 }
9232 
9233 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
9234     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
9235     const MapVector<Instruction *, Instruction *> &SinkAfter) {
9236 
9237   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9238 
9239   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
9240 
9241   // ---------------------------------------------------------------------------
9242   // Pre-construction: record ingredients whose recipes we'll need to further
9243   // process after constructing the initial VPlan.
9244   // ---------------------------------------------------------------------------
9245 
9246   // Mark instructions we'll need to sink later and their targets as
9247   // ingredients whose recipe we'll need to record.
9248   for (auto &Entry : SinkAfter) {
9249     RecipeBuilder.recordRecipeOf(Entry.first);
9250     RecipeBuilder.recordRecipeOf(Entry.second);
9251   }
9252   for (auto &Reduction : CM.getInLoopReductionChains()) {
9253     PHINode *Phi = Reduction.first;
9254     RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
9255     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9256 
9257     RecipeBuilder.recordRecipeOf(Phi);
9258     for (auto &R : ReductionOperations) {
9259       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
      // need to record the ICmp recipe, so it can be removed later.
9262       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9263              "Only min/max recurrences allowed for inloop reductions");
9264       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9265         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9266     }
9267   }
9268 
9269   // For each interleave group which is relevant for this (possibly trimmed)
9270   // Range, add it to the set of groups to be later applied to the VPlan and add
9271   // placeholders for its members' Recipes which we'll be replacing with a
9272   // single VPInterleaveRecipe.
9273   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9274     auto applyIG = [IG, this](ElementCount VF) -> bool {
9275       return (VF.isVector() && // Query is illegal for VF == 1
9276               CM.getWideningDecision(IG->getInsertPos(), VF) ==
9277                   LoopVectorizationCostModel::CM_Interleave);
9278     };
9279     if (!getDecisionAndClampRange(applyIG, Range))
9280       continue;
9281     InterleaveGroups.insert(IG);
9282     for (unsigned i = 0; i < IG->getFactor(); i++)
9283       if (Instruction *Member = IG->getMember(i))
9284         RecipeBuilder.recordRecipeOf(Member);
9285   };
9286 
9287   // ---------------------------------------------------------------------------
9288   // Build initial VPlan: Scan the body of the loop in a topological order to
9289   // visit each basic block after having visited its predecessor basic blocks.
9290   // ---------------------------------------------------------------------------
9291 
9292   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
9293   auto Plan = std::make_unique<VPlan>();
9294   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
9295   Plan->setEntry(VPBB);
9296 
9297   // Scan the body of the loop in a topological order to visit each basic block
9298   // after having visited its predecessor basic blocks.
9299   LoopBlocksDFS DFS(OrigLoop);
9300   DFS.perform(LI);
9301 
9302   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9303     // Relevant instructions from basic block BB will be grouped into VPRecipe
9304     // ingredients and fill a new VPBasicBlock.
9305     unsigned VPBBsForBB = 0;
9306     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
9307     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
9308     VPBB = FirstVPBBForBB;
9309     Builder.setInsertPoint(VPBB);
9310 
9311     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
9313     for (Instruction &I : BB->instructionsWithoutDebug()) {
9314       Instruction *Instr = &I;
9315 
9316       // First filter out irrelevant instructions, to ensure no recipes are
9317       // built for them.
9318       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9319         continue;
9320 
9321       SmallVector<VPValue *, 4> Operands;
9322       auto *Phi = dyn_cast<PHINode>(Instr);
9323       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9324         Operands.push_back(Plan->getOrAddVPValue(
9325             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9326       } else {
9327         auto OpRange = Plan->mapToVPValues(Instr->operands());
9328         Operands = {OpRange.begin(), OpRange.end()};
9329       }
9330       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9331               Instr, Operands, Range, Plan)) {
9332         // If Instr can be simplified to an existing VPValue, use it.
9333         if (RecipeOrValue.is<VPValue *>()) {
9334           auto *VPV = RecipeOrValue.get<VPValue *>();
9335           Plan->addVPValue(Instr, VPV);
9336           // If the re-used value is a recipe, register the recipe for the
9337           // instruction, in case the recipe for Instr needs to be recorded.
9338           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
9339             RecipeBuilder.setRecipe(Instr, R);
9340           continue;
9341         }
9342         // Otherwise, add the new recipe.
9343         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
9344         for (auto *Def : Recipe->definedValues()) {
9345           auto *UV = Def->getUnderlyingValue();
9346           Plan->addVPValue(UV, Def);
9347         }
9348 
9349         RecipeBuilder.setRecipe(Instr, Recipe);
9350         VPBB->appendRecipe(Recipe);
9351         continue;
9352       }
9353 
9354       // Otherwise, if all widening options failed, Instruction is to be
9355       // replicated. This may create a successor for VPBB.
9356       VPBasicBlock *NextVPBB =
9357           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
9358       if (NextVPBB != VPBB) {
9359         VPBB = NextVPBB;
9360         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
9361                                     : "");
9362       }
9363     }
9364   }
9365 
9366   RecipeBuilder.fixHeaderPhis();
9367 
  // Discard the empty dummy pre-entry VPBasicBlock. Note that other
  // VPBasicBlocks may also be empty, such as the last one (VPBB), reflecting
  // original basic blocks with no recipes.
9371   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
9372   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
9373   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
9374   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
9375   delete PreEntry;
9376 
9377   // ---------------------------------------------------------------------------
9378   // Transform initial VPlan: Apply previously taken decisions, in order, to
9379   // bring the VPlan to its final state.
9380   // ---------------------------------------------------------------------------
9381 
9382   // Apply Sink-After legal constraints.
9383   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9384     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9385     if (Region && Region->isReplicator()) {
9386       assert(Region->getNumSuccessors() == 1 &&
9387              Region->getNumPredecessors() == 1 && "Expected SESE region!");
9388       assert(R->getParent()->size() == 1 &&
9389              "A recipe in an original replicator region must be the only "
9390              "recipe in its block");
9391       return Region;
9392     }
9393     return nullptr;
9394   };
9395   for (auto &Entry : SinkAfter) {
9396     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9397     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9398 
9399     auto *TargetRegion = GetReplicateRegion(Target);
9400     auto *SinkRegion = GetReplicateRegion(Sink);
9401     if (!SinkRegion) {
9402       // If the sink source is not a replicate region, sink the recipe directly.
9403       if (TargetRegion) {
9404         // The target is in a replication region, make sure to move Sink to
9405         // the block after it, not into the replication region itself.
9406         VPBasicBlock *NextBlock =
9407             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9408         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9409       } else
9410         Sink->moveAfter(Target);
9411       continue;
9412     }
9413 
9414     // The sink source is in a replicate region. Unhook the region from the CFG.
9415     auto *SinkPred = SinkRegion->getSinglePredecessor();
9416     auto *SinkSucc = SinkRegion->getSingleSuccessor();
9417     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9418     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9419     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9420 
9421     if (TargetRegion) {
      // The target recipe is also in a replicate region; move the sink region
      // after the target region.
9424       auto *TargetSucc = TargetRegion->getSingleSuccessor();
9425       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9426       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9427       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9428     } else {
      // The sink source is in a replicate region but the target is not; we
      // need to move the whole replicate region, which should only contain a
      // single recipe in its main block.
9432       auto *SplitBlock =
9433           Target->getParent()->splitAt(std::next(Target->getIterator()));
9434 
9435       auto *SplitPred = SplitBlock->getSinglePredecessor();
9436 
9437       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9438       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9439       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9440       if (VPBB == SplitPred)
9441         VPBB = SplitBlock;
9442     }
9443   }
9444 
9445   // Adjust the recipes for any inloop reductions.
9446   adjustRecipesForReductions(VPBB, Plan, RecipeBuilder, Range.Start);
9447 
9448   // Introduce a recipe to combine the incoming and previous values of a
9449   // first-order recurrence.
9450   for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9451     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9452     if (!RecurPhi)
9453       continue;
9454 
9455     auto *RecurSplice = cast<VPInstruction>(
9456         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9457                              {RecurPhi, RecurPhi->getBackedgeValue()}));
9458 
9459     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
9460     if (auto *Region = GetReplicateRegion(PrevRecipe)) {
9461       VPBasicBlock *Succ = cast<VPBasicBlock>(Region->getSingleSuccessor());
9462       RecurSplice->moveBefore(*Succ, Succ->getFirstNonPhi());
9463     } else
9464       RecurSplice->moveAfter(PrevRecipe);
9465     RecurPhi->replaceAllUsesWith(RecurSplice);
9466     // Set the first operand of RecurSplice to RecurPhi again, after replacing
9467     // all users.
9468     RecurSplice->setOperand(0, RecurPhi);
9469   }
9470 
9471   // Interleave memory: for each Interleave Group we marked earlier as relevant
9472   // for this VPlan, replace the Recipes widening its memory instructions with a
9473   // single VPInterleaveRecipe at its insertion point.
9474   for (auto IG : InterleaveGroups) {
9475     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9476         RecipeBuilder.getRecipe(IG->getInsertPos()));
9477     SmallVector<VPValue *, 4> StoredValues;
9478     for (unsigned i = 0; i < IG->getFactor(); ++i)
9479       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9480         auto *StoreR =
9481             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9482         StoredValues.push_back(StoreR->getStoredValue());
9483       }
9484 
9485     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9486                                         Recipe->getMask());
9487     VPIG->insertBefore(Recipe);
9488     unsigned J = 0;
9489     for (unsigned i = 0; i < IG->getFactor(); ++i)
9490       if (Instruction *Member = IG->getMember(i)) {
9491         if (!Member->getType()->isVoidTy()) {
9492           VPValue *OriginalV = Plan->getVPValue(Member);
9493           Plan->removeVPValueFor(Member);
9494           Plan->addVPValue(Member, VPIG->getVPValue(J));
9495           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9496           J++;
9497         }
9498         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9499       }
9500   }
9501 
  // From this point onwards, VPlan-to-VPlan transformations may change the
  // plan in ways that make accessing values using original IR values
  // incorrect.
9504   Plan->disableValue2VPValue();
9505 
9506   VPlanTransforms::sinkScalarOperands(*Plan);
9507   VPlanTransforms::mergeReplicateRegions(*Plan);
9508 
9509   std::string PlanName;
9510   raw_string_ostream RSO(PlanName);
9511   ElementCount VF = Range.Start;
9512   Plan->addVF(VF);
9513   RSO << "Initial VPlan for VF={" << VF;
9514   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9515     Plan->addVF(VF);
9516     RSO << "," << VF;
9517   }
9518   RSO << "},UF>=1";
9519   RSO.flush();
9520   Plan->setName(PlanName);
9521 
9522   return Plan;
9523 }
9524 
9525 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is
  // profitable. Since we cannot modify the incoming IR, we need to build VPlan
  // upfront in the vectorization pipeline.
9530   assert(!OrigLoop->isInnermost());
9531   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9532 
9533   // Create new empty VPlan
9534   auto Plan = std::make_unique<VPlan>();
9535 
9536   // Build hierarchical CFG
9537   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9538   HCFGBuilder.buildHierarchicalCFG();
9539 
9540   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9541        VF *= 2)
9542     Plan->addVF(VF);
9543 
9544   if (EnableVPlanPredication) {
9545     VPlanPredicator VPP(*Plan);
9546     VPP.predicate();
9547 
9548     // Avoid running transformation to recipes until masked code generation in
9549     // VPlan-native path is in place.
9550     return Plan;
9551   }
9552 
9553   SmallPtrSet<Instruction *, 1> DeadInstructions;
9554   VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
9555                                              Legal->getInductionVars(),
9556                                              DeadInstructions, *PSE.getSE());
9557   return Plan;
9558 }
9559 
// Adjust the recipes for reductions. For in-loop reductions, the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
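// For example, for an in-loop add reduction with
//   %phi = phi ... ; %sum = add %phi, %x
// the widened 'add' recipe is replaced by a VPReductionRecipe whose operands
// are the scalar chain (%phi) and the vector operand (%x).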
9565 void LoopVectorizationPlanner::adjustRecipesForReductions(
9566     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9567     ElementCount MinVF) {
9568   for (auto &Reduction : CM.getInLoopReductionChains()) {
9569     PHINode *Phi = Reduction.first;
9570     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9571     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9572 
9573     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9574       continue;
9575 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
9580     Instruction *Chain = Phi;
9581     for (Instruction *R : ReductionOperations) {
9582       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9583       RecurKind Kind = RdxDesc.getRecurrenceKind();
9584 
9585       VPValue *ChainOp = Plan->getVPValue(Chain);
9586       unsigned FirstOpId;
9587       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9588              "Only min/max recurrences allowed for inloop reductions");
9589       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9590         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9591                "Expected to replace a VPWidenSelectSC");
9592         FirstOpId = 1;
9593       } else {
9594         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
9595                "Expected to replace a VPWidenSC");
9596         FirstOpId = 0;
9597       }
9598       unsigned VecOpId =
9599           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9600       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9601 
9602       auto *CondOp = CM.foldTailByMasking()
9603                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9604                          : nullptr;
9605       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9606           &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9607       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9608       Plan->removeVPValueFor(R);
9609       Plan->addVPValue(R, RedRecipe);
9610       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9612       WidenRecipe->eraseFromParent();
9613 
9614       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9615         VPRecipeBase *CompareRecipe =
9616             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9617         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9618                "Expected to replace a VPWidenSC");
9619         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9620                "Expected no remaining users");
9621         CompareRecipe->eraseFromParent();
9622       }
9623       Chain = R;
9624     }
9625   }
9626 
9627   // If tail is folded by masking, introduce selects between the phi
9628   // and the live-out instruction of each reduction, at the end of the latch.
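  // Schematically (illustrative): %red = select %header.mask, %red.backedge,
  // %red.phi, so lanes disabled by the mask keep the value carried by the
  // phi.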
9629   if (CM.foldTailByMasking()) {
9630     for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9631       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9632       if (!PhiR || PhiR->isInLoop())
9633         continue;
9634       Builder.setInsertPoint(LatchVPBB);
9635       VPValue *Cond =
9636           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9637       VPValue *Red = PhiR->getBackedgeValue();
9638       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9639     }
9640   }
9641 }
9642 
9643 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9644 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9645                                VPSlotTracker &SlotTracker) const {
9646   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9647   IG->getInsertPos()->printAsOperand(O, false);
9648   O << ", ";
9649   getAddr()->printAsOperand(O, SlotTracker);
9650   VPValue *Mask = getMask();
9651   if (Mask) {
9652     O << ", ";
9653     Mask->printAsOperand(O, SlotTracker);
9654   }
9655 
9656   unsigned OpIdx = 0;
9657   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9658     if (!IG->getMember(i))
9659       continue;
9660     if (getNumStoreOperands() > 0) {
9661       O << "\n" << Indent << "  store ";
9662       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9663       O << " to index " << i;
9664     } else {
9665       O << "\n" << Indent << "  ";
9666       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9667       O << " = load from index " << i;
9668     }
9669     ++OpIdx;
9670   }
9671 }
9672 #endif
9673 
9674 void VPWidenCallRecipe::execute(VPTransformState &State) {
9675   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9676                                   *this, State);
9677 }
9678 
9679 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9680   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
9681                                     this, *this, InvariantCond, State);
9682 }
9683 
9684 void VPWidenRecipe::execute(VPTransformState &State) {
9685   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
9686 }
9687 
9688 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9689   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
9690                       *this, State.UF, State.VF, IsPtrLoopInvariant,
9691                       IsIndexLoopInvariant, State);
9692 }
9693 
9694 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9695   assert(!State.Instance && "Int or FP induction being replicated.");
9696   State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
9697                                    getTruncInst(), getVPValue(0),
9698                                    getCastValue(), State);
9699 }
9700 
9701 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9702   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
9703                                  State);
9704 }
9705 
9706 void VPBlendRecipe::execute(VPTransformState &State) {
9707   State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9708   // We know that all PHIs in non-header blocks are converted into
9709   // selects, so we don't have to worry about the insertion order and we
9710   // can just use the builder.
9711   // At this point we generate the predication tree. There may be
9712   // duplications since this is a simple recursive scan, but future
9713   // optimizations will clean it up.
9714 
9715   unsigned NumIncoming = getNumIncomingValues();
9716 
9717   // Generate a sequence of selects of the form:
9718   // SELECT(Mask3, In3,
9719   //        SELECT(Mask2, In2,
9720   //               SELECT(Mask1, In1,
9721   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // are essentially undef and are taken from In0.
9724   InnerLoopVectorizer::VectorParts Entry(State.UF);
9725   for (unsigned In = 0; In < NumIncoming; ++In) {
9726     for (unsigned Part = 0; Part < State.UF; ++Part) {
9727       // We might have single edge PHIs (blocks) - use an identity
9728       // 'select' for the first PHI operand.
9729       Value *In0 = State.get(getIncomingValue(In), Part);
9730       if (In == 0)
9731         Entry[Part] = In0; // Initialize with the first incoming value.
9732       else {
9733         // Select between the current value and the previous incoming edge
9734         // based on the incoming mask.
9735         Value *Cond = State.get(getMask(In), Part);
9736         Entry[Part] =
9737             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9738       }
9739     }
9740   }
9741   for (unsigned Part = 0; Part < State.UF; ++Part)
9742     State.set(this, Entry[Part], Part);
9743 }
9744 
9745 void VPInterleaveRecipe::execute(VPTransformState &State) {
9746   assert(!State.Instance && "Interleave group being replicated.");
9747   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9748                                       getStoredValues(), getMask());
9749 }
9750 
9751 void VPReductionRecipe::execute(VPTransformState &State) {
9752   assert(!State.Instance && "Reduction being replicated.");
9753   Value *PrevInChain = State.get(getChainOp(), 0);
9754   for (unsigned Part = 0; Part < State.UF; ++Part) {
9755     RecurKind Kind = RdxDesc->getRecurrenceKind();
9756     bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9757     Value *NewVecOp = State.get(getVecOp(), Part);
9758     if (VPValue *Cond = getCondOp()) {
9759       Value *NewCond = State.get(Cond, Part);
9760       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9761       Value *Iden = RdxDesc->getRecurrenceIdentity(
9762           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9763       Value *IdenVec =
9764           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9765       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9766       NewVecOp = Select;
9767     }
9768     Value *NewRed;
9769     Value *NextInChain;
9770     if (IsOrdered) {
9771       if (State.VF.isVector())
9772         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9773                                         PrevInChain);
9774       else
9775         NewRed = State.Builder.CreateBinOp(
9776             (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(),
9777             PrevInChain, NewVecOp);
9778       PrevInChain = NewRed;
9779     } else {
9780       PrevInChain = State.get(getChainOp(), Part);
9781       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9782     }
9783     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9784       NextInChain =
9785           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9786                          NewRed, PrevInChain);
9787     } else if (IsOrdered)
9788       NextInChain = NewRed;
9789     else {
9790       NextInChain = State.Builder.CreateBinOp(
9791           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
9792           PrevInChain);
9793     }
9794     State.set(this, NextInChain, Part);
9795   }
9796 }
9797 
9798 void VPReplicateRecipe::execute(VPTransformState &State) {
9799   if (State.Instance) { // Generate a single instance.
9800     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9801     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9802                                     *State.Instance, IsPredicated, State);
9803     // Insert scalar instance packing it into a vector.
9804     if (AlsoPack && State.VF.isVector()) {
9805       // If we're constructing lane 0, initialize to start from poison.
9806       if (State.Instance->Lane.isFirstLane()) {
9807         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9808         Value *Poison = PoisonValue::get(
9809             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9810         State.set(this, Poison, State.Instance->Part);
9811       }
9812       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9813     }
9814     return;
9815   }
9816 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane of
  // each of the UF parts.
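  // For example (illustrative): with UF=2 and VF=4 a non-uniform instruction
  // is scalarized into 8 copies (lanes 0-3 of parts 0 and 1), whereas a
  // uniform one only needs 2 copies (lane 0 of each part).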
9820   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9821   assert((!State.VF.isScalable() || IsUniform) &&
9822          "Can't scalarize a scalable vector");
9823   for (unsigned Part = 0; Part < State.UF; ++Part)
9824     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9825       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9826                                       VPIteration(Part, Lane), IsPredicated,
9827                                       State);
9828 }
9829 
9830 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9831   assert(State.Instance && "Branch on Mask works only on single instance.");
9832 
9833   unsigned Part = State.Instance->Part;
9834   unsigned Lane = State.Instance->Lane.getKnownLane();
9835 
9836   Value *ConditionBit = nullptr;
9837   VPValue *BlockInMask = getMask();
9838   if (BlockInMask) {
9839     ConditionBit = State.get(BlockInMask, Part);
9840     if (ConditionBit->getType()->isVectorTy())
9841       ConditionBit = State.Builder.CreateExtractElement(
9842           ConditionBit, State.Builder.getInt32(Lane));
9843   } else // Block in mask is all-one.
9844     ConditionBit = State.Builder.getTrue();
9845 
9846   // Replace the temporary unreachable terminator with a new conditional branch,
9847   // whose two destinations will be set later when they are created.
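  // Schematically (illustrative) the block will then end with
  //   br i1 %ConditionBit, label <tbd>, label <tbd>
  // where both labels are filled in once the successor blocks exist.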
9848   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9849   assert(isa<UnreachableInst>(CurrentTerminator) &&
9850          "Expected to replace unreachable terminator with conditional branch.");
9851   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9852   CondBr->setSuccessor(0, nullptr);
9853   ReplaceInstWithInst(CurrentTerminator, CondBr);
9854 }
9855 
9856 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9857   assert(State.Instance && "Predicated instruction PHI works per instance.");
9858   Instruction *ScalarPredInst =
9859       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9860   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9861   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9862   assert(PredicatingBB && "Predicated block has no single predecessor.");
9863   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9864          "operand must be VPReplicateRecipe");
9865 
9866   // By current pack/unpack logic we need to generate only a single phi node: if
9867   // a vector value for the predicated instruction exists at this point it means
9868   // the instruction has vector users only, and a phi for the vector value is
9869   // needed. In this case the recipe of the predicated instruction is marked to
9870   // also do that packing, thereby "hoisting" the insert-element sequence.
9871   // Otherwise, a phi node for the scalar value is needed.
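  // For example (illustrative), the vector case below produces
  //   %vphi = phi [ %vec.before.insert, %PredicatingBB ],
  //               [ %insertelement.result, %PredicatedBB ]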
9872   unsigned Part = State.Instance->Part;
9873   if (State.hasVectorValue(getOperand(0), Part)) {
9874     Value *VectorValue = State.get(getOperand(0), Part);
9875     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9876     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9877     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9878     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9879     if (State.hasVectorValue(this, Part))
9880       State.reset(this, VPhi, Part);
9881     else
9882       State.set(this, VPhi, Part);
9883     // NOTE: Currently we need to update the value of the operand, so the next
9884     // predicated iteration inserts its generated value in the correct vector.
9885     State.reset(getOperand(0), VPhi, Part);
9886   } else {
9887     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9888     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9889     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9890                      PredicatingBB);
9891     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9892     if (State.hasScalarValue(this, *State.Instance))
9893       State.reset(this, Phi, *State.Instance);
9894     else
9895       State.set(this, Phi, *State.Instance);
9896     // NOTE: Currently we need to update the value of the operand, so the next
9897     // predicated iteration inserts its generated value in the correct vector.
9898     State.reset(getOperand(0), Phi, *State.Instance);
9899   }
9900 }
9901 
9902 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9903   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9904   State.ILV->vectorizeMemoryInstruction(
9905       &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(),
9906       StoredValue, getMask(), Consecutive, Reverse);
9907 }
9908 
9909 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9910 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9911 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9912 // for predication.
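// For example (illustrative): PreferPredicateTy::PredicateOrDontVectorize
// maps to CM_ScalarEpilogueNotAllowedUsePredicate, while a predicate hint of
// FK_Disabled leaves CM_ScalarEpilogueAllowed in effect.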
9913 static ScalarEpilogueLowering getScalarEpilogueLowering(
9914     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9915     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9916     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9917     LoopVectorizationLegality &LVL) {
9918   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9919   // don't look at hints or options, and don't request a scalar epilogue.
9920   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9921   // LoopAccessInfo (due to code dependency and not being able to reliably get
9922   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9923   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9924   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9925   // back to the old way and vectorize with versioning when forced. See D81345.)
9926   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9927                                                       PGSOQueryType::IRPass) &&
9928                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9929     return CM_ScalarEpilogueNotAllowedOptSize;
9930 
9931   // 2) If set, obey the directives
9932   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9933     switch (PreferPredicateOverEpilogue) {
9934     case PreferPredicateTy::ScalarEpilogue:
9935       return CM_ScalarEpilogueAllowed;
9936     case PreferPredicateTy::PredicateElseScalarEpilogue:
9937       return CM_ScalarEpilogueNotNeededUsePredicate;
9938     case PreferPredicateTy::PredicateOrDontVectorize:
9939       return CM_ScalarEpilogueNotAllowedUsePredicate;
    }
9941   }
9942 
9943   // 3) If set, obey the hints
9944   switch (Hints.getPredicate()) {
9945   case LoopVectorizeHints::FK_Enabled:
9946     return CM_ScalarEpilogueNotNeededUsePredicate;
9947   case LoopVectorizeHints::FK_Disabled:
9948     return CM_ScalarEpilogueAllowed;
  }
9950 
9951   // 4) if the TTI hook indicates this is profitable, request predication.
9952   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
9953                                        LVL.getLAI()))
9954     return CM_ScalarEpilogueNotNeededUsePredicate;
9955 
9956   return CM_ScalarEpilogueAllowed;
9957 }
9958 
9959 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If values have been set for this Def, return the one relevant for \p Part.
9961   if (hasVectorValue(Def, Part))
9962     return Data.PerPartOutput[Def][Part];
9963 
9964   if (!hasScalarValue(Def, {Part, 0})) {
9965     Value *IRV = Def->getLiveInIRValue();
9966     Value *B = ILV->getBroadcastInstrs(IRV);
9967     set(Def, B, Part);
9968     return B;
9969   }
9970 
9971   Value *ScalarValue = get(Def, {Part, 0});
9972   // If we aren't vectorizing, we can just copy the scalar map values over
9973   // to the vector map.
9974   if (VF.isScalar()) {
9975     set(Def, ScalarValue, Part);
9976     return ScalarValue;
9977   }
9978 
9979   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
9980   bool IsUniform = RepR && RepR->isUniform();
9981 
9982   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9983   // Check if there is a scalar value for the selected lane.
9984   if (!hasScalarValue(Def, {Part, LastLane})) {
9985     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
9986     assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
9987            "unexpected recipe found to be invariant");
9988     IsUniform = true;
9989     LastLane = 0;
9990   }
9991 
9992   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9993   // Set the insert point after the last scalarized instruction or after the
9994   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
9995   // will directly follow the scalar definitions.
9996   auto OldIP = Builder.saveIP();
9997   auto NewIP =
9998       isa<PHINode>(LastInst)
9999           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10000           : std::next(BasicBlock::iterator(LastInst));
10001   Builder.SetInsertPoint(&*NewIP);
10002 
10003   // However, if we are vectorizing, we need to construct the vector values.
10004   // If the value is known to be uniform after vectorization, we can just
10005   // broadcast the scalar value corresponding to lane zero for each unroll
10006   // iteration. Otherwise, we construct the vector values using
10007   // insertelement instructions. Since the resulting vectors are stored in
10008   // State, we will only generate the insertelements once.
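  // For example (illustrative, VF=4, non-uniform Def), the packing emits
  //   %v0 = insertelement <4 x ty> poison, %s0, i32 0
  //   %v1 = insertelement <4 x ty> %v0, %s1, i32 1
  //   ... up to lane 3,
  // whereas a uniform Def is simply broadcast from its lane-0 scalar.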
10009   Value *VectorValue = nullptr;
10010   if (IsUniform) {
10011     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10012     set(Def, VectorValue, Part);
10013   } else {
    // Initialize packing with insertelements to start from poison.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Poison, Part);
10018     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10019       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10020     VectorValue = get(Def, Part);
10021   }
10022   Builder.restoreIP(OldIP);
10023   return VectorValue;
10024 }
10025 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
10030 static bool processLoopInVPlanNativePath(
10031     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10032     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10033     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10034     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10035     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10036     LoopVectorizationRequirements &Requirements) {
10037 
10038   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10039     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10040     return false;
10041   }
10042   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10043   Function *F = L->getHeader()->getParent();
10044   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10045 
10046   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10047       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10048 
10049   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10050                                 &Hints, IAI);
10051   // Use the planner for outer loop vectorization.
10052   // TODO: CM is not used at this point inside the planner. Turn CM into an
10053   // optional argument if we don't need it in the future.
10054   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10055                                Requirements, ORE);
10056 
10057   // Get user vectorization factor.
10058   ElementCount UserVF = Hints.getWidth();
10059 
10060   CM.collectElementTypesForWidening();
10061 
10062   // Plan how to best vectorize, return the best VF and its cost.
10063   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10064 
10065   // If we are stress testing VPlan builds, do not attempt to generate vector
10066   // code. Masked vector code generation support will follow soon.
10067   // Also, do not attempt to vectorize if no vector code will be produced.
10068   if (VPlanBuildStressTest || EnableVPlanPredication ||
10069       VectorizationFactor::Disabled() == VF)
10070     return false;
10071 
10072   LVP.setBestPlan(VF.Width, 1);
10073 
10074   {
10075     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10076                              F->getParent()->getDataLayout());
10077     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10078                            &CM, BFI, PSI, Checks);
10079     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10080                       << L->getHeader()->getParent()->getName() << "\"\n");
10081     LVP.executePlan(LB, DT);
10082   }
10083 
10084   // Mark the loop as already vectorized to avoid vectorizing again.
10085   Hints.setAlreadyVectorized();
10086   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10087   return true;
10088 }
10089 
// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with double precision there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
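// A typical (illustrative) source pattern that triggers the remark:
//   float f = ...;
//   f = f * 2.0; // '2.0' is a double constant, so the multiply is performed
//                // in double and needs fpext/fptrunc casts around it.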
10094 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10095   SmallVector<Instruction *, 4> Worklist;
10096   for (BasicBlock *BB : L->getBlocks()) {
10097     for (Instruction &Inst : *BB) {
10098       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10099         if (S->getValueOperand()->getType()->isFloatTy())
10100           Worklist.push_back(S);
10101       }
10102     }
10103   }
10104 
  // Traverse upwards from the floating point stores, searching for floating
  // point conversions.
10107   SmallPtrSet<const Instruction *, 4> Visited;
10108   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10109   while (!Worklist.empty()) {
10110     auto *I = Worklist.pop_back_val();
10111     if (!L->contains(I))
10112       continue;
10113     if (!Visited.insert(I).second)
10114       continue;
10115 
10116     // Emit a remark if the floating point store required a floating
10117     // point conversion.
10118     // TODO: More work could be done to identify the root cause such as a
10119     // constant or a function return type and point the user to it.
10120     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10121       ORE->emit([&]() {
10122         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10123                                           I->getDebugLoc(), L->getHeader())
10124                << "floating point conversion changes vector width. "
10125                << "Mixed floating point precision requires an up/down "
10126                << "cast that will negatively impact performance.";
10127       });
10128 
10129     for (Use &Op : I->operands())
10130       if (auto *OpI = dyn_cast<Instruction>(Op))
10131         Worklist.push_back(OpI);
10132   }
10133 }
10134 
10135 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10136     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10137                                !EnableLoopInterleaving),
10138       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10139                               !EnableLoopVectorization) {}
10140 
10141 bool LoopVectorizePass::processLoop(Loop *L) {
10142   assert((EnableVPlanNativePath || L->isInnermost()) &&
10143          "VPlan-native path is not enabled. Only process inner loops.");
10144 
10145 #ifndef NDEBUG
10146   const std::string DebugLocStr = getDebugLocString(L);
10147 #endif /* NDEBUG */
10148 
10149   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
10150                     << L->getHeader()->getParent()->getName() << "\" from "
10151                     << DebugLocStr << "\n");
10152 
10153   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
10154 
10155   LLVM_DEBUG(
10156       dbgs() << "LV: Loop hints:"
10157              << " force="
10158              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10159                      ? "disabled"
10160                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10161                             ? "enabled"
10162                             : "?"))
10163              << " width=" << Hints.getWidth()
10164              << " interleave=" << Hints.getInterleave() << "\n");
10165 
10166   // Function containing loop
10167   Function *F = L->getHeader()->getParent();
10168 
  // Looking at the diagnostic output is the only way to determine if a loop
  // was vectorized (other than looking at the IR or machine code), so it
  // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose and report, respectively, vectorized loops and unvectorized loops
  // that may benefit from vectorization.
10176 
10177   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10178     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10179     return false;
10180   }
10181 
10182   PredicatedScalarEvolution PSE(*SE, *L);
10183 
10184   // Check if it is legal to vectorize the loop.
10185   LoopVectorizationRequirements Requirements;
10186   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10187                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10188   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10189     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10190     Hints.emitRemarkWithHints();
10191     return false;
10192   }
10193 
10194   // Check the function attributes and profiles to find out if this function
10195   // should be optimized for size.
10196   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10197       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10198 
10199   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10200   // here. They may require CFG and instruction level transformations before
10201   // even evaluating whether vectorization is profitable. Since we cannot modify
10202   // the incoming IR, we need to build VPlan upfront in the vectorization
10203   // pipeline.
10204   if (!L->isInnermost())
10205     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10206                                         ORE, BFI, PSI, Hints, Requirements);
10207 
10208   assert(L->isInnermost() && "Inner loop expected.");
10209 
10210   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10211   // count by optimizing for size, to minimize overheads.
10212   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10213   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10214     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10215                       << "This loop is worth vectorizing only if no scalar "
10216                       << "iteration overheads are incurred.");
10217     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10218       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10219     else {
10220       LLVM_DEBUG(dbgs() << "\n");
10221       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10222     }
10223   }
10224 
10225   // Check the function attributes to see if implicit floats are allowed.
  // FIXME: This check doesn't seem correct -- what if the loop is
10227   // an integer loop and the vector instructions selected are purely integer
10228   // vector instructions?
10229   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10230     reportVectorizationFailure(
10231         "Can't vectorize when the NoImplicitFloat attribute is used",
10232         "loop not vectorized due to NoImplicitFloat attribute",
10233         "NoImplicitFloat", ORE, L);
10234     Hints.emitRemarkWithHints();
10235     return false;
10236   }
10237 
10238   // Check if the target supports potentially unsafe FP vectorization.
10239   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10240   // for the target we're vectorizing for, to make sure none of the
10241   // additional fp-math flags can help.
10242   if (Hints.isPotentiallyUnsafe() &&
10243       TTI->isFPVectorizationPotentiallyUnsafe()) {
10244     reportVectorizationFailure(
10245         "Potentially unsafe FP op prevents vectorization",
10246         "loop not vectorized due to unsafe FP support.",
10247         "UnsafeFP", ORE, L);
10248     Hints.emitRemarkWithHints();
10249     return false;
10250   }
10251 
10252   bool AllowOrderedReductions;
10253   // If the flag is set, use that instead and override the TTI behaviour.
10254   if (ForceOrderedReductions.getNumOccurrences() > 0)
10255     AllowOrderedReductions = ForceOrderedReductions;
10256   else
10257     AllowOrderedReductions = TTI->enableOrderedReductions();
10258   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10259     ORE->emit([&]() {
10260       auto *ExactFPMathInst = Requirements.getExactFPInst();
10261       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10262                                                  ExactFPMathInst->getDebugLoc(),
10263                                                  ExactFPMathInst->getParent())
10264              << "loop not vectorized: cannot prove it is safe to reorder "
10265                 "floating-point operations";
10266     });
10267     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10268                          "reorder floating-point operations\n");
10269     Hints.emitRemarkWithHints();
10270     return false;
10271   }
10272 
10273   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10274   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10275 
10276   // If an override option has been passed in for interleaved accesses, use it.
10277   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10278     UseInterleaved = EnableInterleavedMemAccesses;
10279 
10280   // Analyze interleaved memory accesses.
10281   if (UseInterleaved) {
10282     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10283   }
10284 
10285   // Use the cost model.
10286   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10287                                 F, &Hints, IAI);
10288   CM.collectValuesToIgnore();
10289   CM.collectElementTypesForWidening();
10290 
10291   // Use the planner for vectorization.
10292   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10293                                Requirements, ORE);
10294 
10295   // Get user vectorization factor and interleave count.
10296   ElementCount UserVF = Hints.getWidth();
10297   unsigned UserIC = Hints.getInterleave();
10298 
10299   // Plan how to best vectorize, return the best VF and its cost.
10300   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10301 
10302   VectorizationFactor VF = VectorizationFactor::Disabled();
10303   unsigned IC = 1;
10304 
10305   if (MaybeVF) {
10306     VF = *MaybeVF;
10307     // Select the interleave count.
10308     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10309   }
10310 
10311   // Identify the diagnostic messages that should be produced.
10312   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10313   bool VectorizeLoop = true, InterleaveLoop = true;
10314   if (VF.Width.isScalar()) {
10315     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10316     VecDiagMsg = std::make_pair(
10317         "VectorizationNotBeneficial",
10318         "the cost-model indicates that vectorization is not beneficial");
10319     VectorizeLoop = false;
10320   }
10321 
10322   if (!MaybeVF && UserIC > 1) {
10323     // Tell the user interleaving was avoided up-front, despite being explicitly
10324     // requested.
10325     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10326                          "interleaving should be avoided up front\n");
10327     IntDiagMsg = std::make_pair(
10328         "InterleavingAvoided",
10329         "Ignoring UserIC, because interleaving was avoided up front");
10330     InterleaveLoop = false;
10331   } else if (IC == 1 && UserIC <= 1) {
10332     // Tell the user interleaving is not beneficial.
10333     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10334     IntDiagMsg = std::make_pair(
10335         "InterleavingNotBeneficial",
10336         "the cost-model indicates that interleaving is not beneficial");
10337     InterleaveLoop = false;
10338     if (UserIC == 1) {
10339       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10340       IntDiagMsg.second +=
10341           " and is explicitly disabled or interleave count is set to 1";
10342     }
10343   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial but explicitly disabled.
10345     LLVM_DEBUG(
10346         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10347     IntDiagMsg = std::make_pair(
10348         "InterleavingBeneficialButDisabled",
10349         "the cost-model indicates that interleaving is beneficial "
10350         "but is explicitly disabled or interleave count is set to 1");
10351     InterleaveLoop = false;
10352   }
10353 
10354   // Override IC if user provided an interleave count.
10355   IC = UserIC > 0 ? UserIC : IC;
10356 
10357   // Emit diagnostic messages, if any.
10358   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10359   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
10361     ORE->emit([&]() {
10362       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10363                                       L->getStartLoc(), L->getHeader())
10364              << VecDiagMsg.second;
10365     });
10366     ORE->emit([&]() {
10367       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10368                                       L->getStartLoc(), L->getHeader())
10369              << IntDiagMsg.second;
10370     });
10371     return false;
10372   } else if (!VectorizeLoop && InterleaveLoop) {
10373     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10374     ORE->emit([&]() {
10375       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10376                                         L->getStartLoc(), L->getHeader())
10377              << VecDiagMsg.second;
10378     });
10379   } else if (VectorizeLoop && !InterleaveLoop) {
10380     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10381                       << ") in " << DebugLocStr << '\n');
10382     ORE->emit([&]() {
10383       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10384                                         L->getStartLoc(), L->getHeader())
10385              << IntDiagMsg.second;
10386     });
10387   } else if (VectorizeLoop && InterleaveLoop) {
10388     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10389                       << ") in " << DebugLocStr << '\n');
10390     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10391   }
10392 
10393   bool DisableRuntimeUnroll = false;
10394   MDNode *OrigLoopID = L->getLoopID();
10395   {
    // Optimistically generate runtime checks. Drop them if they turn out not
    // to be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
10399     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10400                              F->getParent()->getDataLayout());
10401     if (!VF.Width.isScalar() || IC > 1)
10402       Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
10403     LVP.setBestPlan(VF.Width, IC);
10404 
10405     using namespace ore;
10406     if (!VectorizeLoop) {
10407       assert(IC > 1 && "interleave count should not be 1 or 0");
      // If we decided that it is not profitable to vectorize the loop, then
      // interleave it.
10410       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10411                                  &CM, BFI, PSI, Checks);
10412       LVP.executePlan(Unroller, DT);
10413 
10414       ORE->emit([&]() {
10415         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10416                                   L->getHeader())
10417                << "interleaved loop (interleaved count: "
10418                << NV("InterleaveCount", IC) << ")";
10419       });
10420     } else {
      // If we decided that it is *profitable* to vectorize the loop, then do
      // it.
10422 
10423       // Consider vectorizing the epilogue too if it's profitable.
10424       VectorizationFactor EpilogueVF =
10425           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10426       if (EpilogueVF.Width.isVector()) {
10427 
10428         // The first pass vectorizes the main loop and creates a scalar epilogue
10429         // to be vectorized by executing the plan (potentially with a different
10430         // factor) again shortly afterwards.
10431         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10432         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10433                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10434 
10435         LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
10436         LVP.executePlan(MainILV, DT);
10437         ++LoopsVectorized;
10438 
10439         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10440         formLCSSARecursively(*L, *DT, LI, SE);
10441 
10442         // Second pass vectorizes the epilogue and adjusts the control flow
10443         // edges from the first pass.
10444         LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
10445         EPI.MainLoopVF = EPI.EpilogueVF;
10446         EPI.MainLoopUF = EPI.EpilogueUF;
10447         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10448                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10449                                                  Checks);
10450         LVP.executePlan(EpilogILV, DT);
10451         ++LoopsEpilogueVectorized;
10452 
10453         if (!MainILV.areSafetyChecksAdded())
10454           DisableRuntimeUnroll = true;
10455       } else {
10456         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10457                                &LVL, &CM, BFI, PSI, Checks);
10458         LVP.executePlan(LB, DT);
10459         ++LoopsVectorized;
10460 
10461         // Add metadata to disable runtime unrolling a scalar loop when there
10462         // are no runtime checks about strides and memory. A scalar loop that is
10463         // rarely used is not worth unrolling.
10464         if (!LB.areSafetyChecksAdded())
10465           DisableRuntimeUnroll = true;
10466       }
10467       // Report the vectorization decision.
10468       ORE->emit([&]() {
10469         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10470                                   L->getHeader())
10471                << "vectorized loop (vectorization width: "
10472                << NV("VectorizationFactor", VF.Width)
10473                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10474       });
10475     }
10476 
10477     if (ORE->allowExtraAnalysis(LV_NAME))
10478       checkMixedPrecision(L, ORE);
10479   }
10480 
10481   Optional<MDNode *> RemainderLoopID =
10482       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10483                                       LLVMLoopVectorizeFollowupEpilogue});
10484   if (RemainderLoopID.hasValue()) {
10485     L->setLoopID(RemainderLoopID.getValue());
10486   } else {
10487     if (DisableRuntimeUnroll)
10488       AddRuntimeUnrollDisableMetaData(L);
10489 
10490     // Mark the loop as already vectorized to avoid vectorizing again.
10491     Hints.setAlreadyVectorized();
10492   }
10493 
10494   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10495   return true;
10496 }
10497 
10498 LoopVectorizeResult LoopVectorizePass::runImpl(
10499     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10500     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10501     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10502     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10503     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10504   SE = &SE_;
10505   LI = &LI_;
10506   TTI = &TTI_;
10507   DT = &DT_;
10508   BFI = &BFI_;
10509   TLI = TLI_;
10510   AA = &AA_;
10511   AC = &AC_;
10512   GetLAA = &GetLAA_;
10513   DB = &DB_;
10514   ORE = &ORE_;
10515   PSI = PSI_;
10516 
10517   // Don't attempt if
10518   // 1. the target claims to have no vector registers, and
10519   // 2. interleaving won't help ILP.
10520   //
10521   // The second condition is necessary because, even if the target has no
10522   // vector registers, loop vectorization may still enable scalar
10523   // interleaving.
10524   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10525       TTI->getMaxInterleaveFactor(1) < 2)
10526     return LoopVectorizeResult(false, false);
10527 
10528   bool Changed = false, CFGChanged = false;
10529 
10530   // The vectorizer requires loops to be in simplified form.
10531   // Since simplification may add new inner loops, it has to run before the
10532   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
10534   // vectorized.
10535   for (auto &L : *LI)
10536     Changed |= CFGChanged |=
10537         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10538 
10539   // Build up a worklist of inner-loops to vectorize. This is necessary as
10540   // the act of vectorizing or partially unrolling a loop creates new loops
10541   // and can invalidate iterators across the loops.
10542   SmallVector<Loop *, 8> Worklist;
10543 
10544   for (Loop *L : *LI)
10545     collectSupportedLoops(*L, LI, ORE, Worklist);
10546 
10547   LoopsAnalyzed += Worklist.size();
10548 
10549   // Now walk the identified inner loops.
10550   while (!Worklist.empty()) {
10551     Loop *L = Worklist.pop_back_val();
10552 
10553     // For the inner loops we actually process, form LCSSA to simplify the
10554     // transform.
10555     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10556 
10557     Changed |= CFGChanged |= processLoop(L);
10558   }
10559 
10560   // Process each loop nest in the function.
10561   return LoopVectorizeResult(Changed, CFGChanged);
10562 }
10563 
10564 PreservedAnalyses LoopVectorizePass::run(Function &F,
10565                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }
  if (!Result.MadeCFGChange)
    PA.preserveSet<CFGAnalyses>();
  return PA;
10604 }
10605 
10606 void LoopVectorizePass::printPipeline(
10607     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10608   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10609       OS, MapClassName2PassName);
10610 
10611   OS << "<";
10612   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10613   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10614   OS << ">";
10615 }
10616