1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions and, from those, the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
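// For example, with a SIMD width of 4 a loop such as
//   for (i = 0; i < n; ++i) a[i] = b[i] + c[i];
// is conceptually rewritten (remainder handling omitted) so that each wide
// iteration processes a[i..i+3], b[i..i+3] and c[i..i+3] at once.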
17 //
// This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
// A development effort is underway to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
//  Data for SIMD.
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
95 #include "llvm/Analysis/TargetLibraryInfo.h"
96 #include "llvm/Analysis/TargetTransformInfo.h"
97 #include "llvm/Analysis/VectorUtils.h"
98 #include "llvm/IR/Attributes.h"
99 #include "llvm/IR/BasicBlock.h"
100 #include "llvm/IR/CFG.h"
101 #include "llvm/IR/Constant.h"
102 #include "llvm/IR/Constants.h"
103 #include "llvm/IR/DataLayout.h"
104 #include "llvm/IR/DebugInfoMetadata.h"
105 #include "llvm/IR/DebugLoc.h"
106 #include "llvm/IR/DerivedTypes.h"
107 #include "llvm/IR/DiagnosticInfo.h"
108 #include "llvm/IR/Dominators.h"
109 #include "llvm/IR/Function.h"
110 #include "llvm/IR/IRBuilder.h"
111 #include "llvm/IR/InstrTypes.h"
112 #include "llvm/IR/Instruction.h"
113 #include "llvm/IR/Instructions.h"
114 #include "llvm/IR/IntrinsicInst.h"
115 #include "llvm/IR/Intrinsics.h"
116 #include "llvm/IR/LLVMContext.h"
117 #include "llvm/IR/Metadata.h"
118 #include "llvm/IR/Module.h"
119 #include "llvm/IR/Operator.h"
120 #include "llvm/IR/PatternMatch.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/InstructionCost.h"
135 #include "llvm/Support/MathExtras.h"
136 #include "llvm/Support/raw_ostream.h"
137 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
138 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
139 #include "llvm/Transforms/Utils/LoopSimplify.h"
140 #include "llvm/Transforms/Utils/LoopUtils.h"
141 #include "llvm/Transforms/Utils/LoopVersioning.h"
142 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
143 #include "llvm/Transforms/Utils/SizeOpts.h"
144 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
145 #include <algorithm>
146 #include <cassert>
147 #include <cstdint>
148 #include <cstdlib>
149 #include <functional>
150 #include <iterator>
151 #include <limits>
152 #include <memory>
153 #include <string>
154 #include <tuple>
155 #include <utility>
156 
157 using namespace llvm;
158 
159 #define LV_NAME "loop-vectorize"
160 #define DEBUG_TYPE LV_NAME
161 
162 #ifndef NDEBUG
163 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
164 #endif
165 
166 /// @{
167 /// Metadata attribute names
168 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
169 const char LLVMLoopVectorizeFollowupVectorized[] =
170     "llvm.loop.vectorize.followup_vectorized";
171 const char LLVMLoopVectorizeFollowupEpilogue[] =
172     "llvm.loop.vectorize.followup_epilogue";
173 /// @}
174 
175 STATISTIC(LoopsVectorized, "Number of loops vectorized");
176 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
177 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
178 
179 static cl::opt<bool> EnableEpilogueVectorization(
180     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
181     cl::desc("Enable vectorization of epilogue loops."));
182 
183 static cl::opt<unsigned> EpilogueVectorizationForceVF(
184     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
185     cl::desc("When epilogue vectorization is enabled, and a value greater than "
186              "1 is specified, forces the given VF for all applicable epilogue "
187              "loops."));
188 
189 static cl::opt<unsigned> EpilogueVectorizationMinVF(
190     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
191     cl::desc("Only loops with vectorization factor equal to or larger than "
192              "the specified value are considered for epilogue vectorization."));
193 
194 /// Loops with a known constant trip count below this number are vectorized only
195 /// if no scalar iteration overheads are incurred.
196 static cl::opt<unsigned> TinyTripCountVectorThreshold(
197     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
198     cl::desc("Loops with a constant trip count that is smaller than this "
199              "value are vectorized only if no scalar iteration overheads "
200              "are incurred."));
201 
202 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
203     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
204     cl::desc("The maximum allowed number of runtime memory checks with a "
205              "vectorize(enable) pragma."));
206 
// The prefer-predicate-over-epilogue option indicates that an epilogue is
// undesired and that predication is preferred; it lists the available
// choices. I.e., the vectorizer will try to fold the tail loop (epilogue)
// into the vector body and predicate the instructions accordingly. If
// tail-folding fails, the fallback strategy depends on these values:
212 namespace PreferPredicateTy {
213   enum Option {
214     ScalarEpilogue = 0,
215     PredicateElseScalarEpilogue,
216     PredicateOrDontVectorize
217   };
218 } // namespace PreferPredicateTy
219 
220 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
221     "prefer-predicate-over-epilogue",
222     cl::init(PreferPredicateTy::ScalarEpilogue),
223     cl::Hidden,
224     cl::desc("Tail-folding and predication preferences over creating a scalar "
225              "epilogue loop."),
226     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
227                          "scalar-epilogue",
228                          "Don't tail-predicate loops, create scalar epilogue"),
229               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
230                          "predicate-else-scalar-epilogue",
231                          "prefer tail-folding, create scalar epilogue if tail "
232                          "folding fails."),
233               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
234                          "predicate-dont-vectorize",
                         "prefer tail-folding, don't attempt vectorization if "
236                          "tail-folding fails.")));
237 
238 static cl::opt<bool> MaximizeBandwidth(
239     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
240     cl::desc("Maximize bandwidth when selecting vectorization factor which "
241              "will be determined by the smallest type in loop."));
242 
243 static cl::opt<bool> EnableInterleavedMemAccesses(
244     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
245     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
246 
247 /// An interleave-group may need masking if it resides in a block that needs
248 /// predication, or in order to mask away gaps.
249 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
250     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
251     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
252 
253 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
254     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
255     cl::desc("We don't interleave loops with a estimated constant trip count "
256              "below this number"));
257 
258 static cl::opt<unsigned> ForceTargetNumScalarRegs(
259     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
260     cl::desc("A flag that overrides the target's number of scalar registers."));
261 
262 static cl::opt<unsigned> ForceTargetNumVectorRegs(
263     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
264     cl::desc("A flag that overrides the target's number of vector registers."));
265 
266 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
267     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
268     cl::desc("A flag that overrides the target's max interleave factor for "
269              "scalar loops."));
270 
271 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
272     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
273     cl::desc("A flag that overrides the target's max interleave factor for "
274              "vectorized loops."));
275 
276 static cl::opt<unsigned> ForceTargetInstructionCost(
277     "force-target-instruction-cost", cl::init(0), cl::Hidden,
278     cl::desc("A flag that overrides the target's expected cost for "
279              "an instruction to a single constant value. Mostly "
280              "useful for getting consistent testing."));
281 
282 static cl::opt<bool> ForceTargetSupportsScalableVectors(
283     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
284     cl::desc(
285         "Pretend that scalable vectors are supported, even if the target does "
286         "not support them. This flag should only be used for testing."));
287 
288 static cl::opt<unsigned> SmallLoopCost(
289     "small-loop-cost", cl::init(20), cl::Hidden,
290     cl::desc(
291         "The cost of a loop that is considered 'small' by the interleaver."));
292 
293 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
294     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
295     cl::desc("Enable the use of the block frequency analysis to access PGO "
296              "heuristics minimizing code growth in cold regions and being more "
297              "aggressive in hot regions."));
298 
299 // Runtime interleave loops for load/store throughput.
300 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
301     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
302     cl::desc(
303         "Enable runtime interleaving until load/store ports are saturated"));
304 
305 /// Interleave small loops with scalar reductions.
306 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
307     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
308     cl::desc("Enable interleaving for loops with small iteration counts that "
309              "contain scalar reductions to expose ILP."));
310 
311 /// The number of stores in a loop that are allowed to need predication.
312 static cl::opt<unsigned> NumberOfStoresToPredicate(
313     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
314     cl::desc("Max number of stores to be predicated behind an if."));
315 
316 static cl::opt<bool> EnableIndVarRegisterHeur(
317     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
318     cl::desc("Count the induction variable only once when interleaving"));
319 
320 static cl::opt<bool> EnableCondStoresVectorization(
321     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
322     cl::desc("Enable if predication of stores during vectorization."));
323 
324 static cl::opt<unsigned> MaxNestedScalarReductionIC(
325     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
326     cl::desc("The maximum interleave count to use when interleaving a scalar "
327              "reduction in a nested loop."));
328 
329 static cl::opt<bool>
330     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
331                            cl::Hidden,
332                            cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));
334 
335 // FIXME: When loop hints are passed which allow reordering of FP operations,
336 // we still choose to use strict reductions with this flag. We should instead
337 // use the default behaviour of vectorizing with unordered reductions if
338 // reordering is allowed.
339 cl::opt<bool> EnableStrictReductions(
340     "enable-strict-reductions", cl::init(false), cl::Hidden,
341     cl::desc("Enable the vectorisation of loops with in-order (strict) "
342              "FP reductions"));
343 
344 static cl::opt<bool> PreferPredicatedReductionSelect(
345     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
346     cl::desc(
347         "Prefer predicating a reduction operation over an after loop select."));
348 
349 cl::opt<bool> EnableVPlanNativePath(
350     "enable-vplan-native-path", cl::init(false), cl::Hidden,
351     cl::desc("Enable VPlan-native vectorization path with "
352              "support for outer loop vectorization."));
353 
354 // FIXME: Remove this switch once we have divergence analysis. Currently we
355 // assume divergent non-backedge branches when this switch is true.
356 cl::opt<bool> EnableVPlanPredication(
357     "enable-vplan-predication", cl::init(false), cl::Hidden,
358     cl::desc("Enable VPlan-native vectorization path predicator with "
359              "support for outer loop vectorization."));
360 
361 // This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
363 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
364 // verification of the H-CFGs built.
365 static cl::opt<bool> VPlanBuildStressTest(
366     "vplan-build-stress-test", cl::init(false), cl::Hidden,
367     cl::desc(
368         "Build VPlan for every supported loop nest in the function and bail "
369         "out right after the build (stress test the VPlan H-CFG construction "
370         "in the VPlan-native vectorization path)."));
371 
372 cl::opt<bool> llvm::EnableLoopInterleaving(
373     "interleave-loops", cl::init(true), cl::Hidden,
374     cl::desc("Enable loop interleaving in Loop vectorization passes"));
375 cl::opt<bool> llvm::EnableLoopVectorization(
376     "vectorize-loops", cl::init(true), cl::Hidden,
377     cl::desc("Run the Loop vectorization passes"));
378 
379 cl::opt<bool> PrintVPlansInDotFormat(
380     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
381     cl::desc("Use dot format instead of plain text when dumping VPlans"));
382 
383 /// A helper function that returns true if the given type is irregular. The
384 /// type is irregular if its allocated size doesn't equal the store size of an
385 /// element of the corresponding vector type.
386 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
387   // Determine if an array of N elements of type Ty is "bitcast compatible"
388   // with a <N x Ty> vector.
389   // This is only true if there is no padding between the array elements.
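  // For example (illustrative): x86_fp80 typically has a type size of 80
  // bits but an alloc size of 96 or 128 bits, so an array of x86_fp80 is
  // padded and the type is treated as irregular here.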
390   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
391 }
392 
393 /// A helper function that returns the reciprocal of the block probability of
394 /// predicated blocks. If we return X, we are assuming the predicated block
395 /// will execute once for every X iterations of the loop header.
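/// For example, under the current 50% assumption a predicated block whose
/// body costs C contributes roughly C / getReciprocalPredBlockProb() == C / 2
/// to the estimated cost of one loop iteration.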
396 ///
397 /// TODO: We should use actual block probability here, if available. Currently,
398 ///       we always assume predicated blocks have a 50% chance of executing.
399 static unsigned getReciprocalPredBlockProb() { return 2; }
400 
401 /// A helper function that returns an integer or floating-point constant with
402 /// value C.
403 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
404   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
405                            : ConstantFP::get(Ty, C);
406 }
407 
408 /// Returns "best known" trip count for the specified loop \p L as defined by
409 /// the following procedure:
410 ///   1) Returns exact trip count if it is known.
411 ///   2) Returns expected trip count according to profile data if any.
412 ///   3) Returns upper bound estimate if it is known.
413 ///   4) Returns None if all of the above failed.
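/// A typical (illustrative) use is to compare the result against a threshold,
/// e.g.:
///   if (auto ExpectedTC = getSmallBestKnownTC(*SE, L))
///     if (*ExpectedTC < TinyTripCountVectorThreshold)
///       ... treat the loop as having a tiny trip count ...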
414 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
415   // Check if exact trip count is known.
416   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
417     return ExpectedTC;
418 
419   // Check if there is an expected trip count available from profile data.
420   if (LoopVectorizeWithBlockFrequency)
421     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
422       return EstimatedTC;
423 
424   // Check if upper bound estimate is known.
425   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
426     return ExpectedTC;
427 
428   return None;
429 }
430 
431 // Forward declare GeneratedRTChecks.
432 class GeneratedRTChecks;
433 
434 namespace llvm {
435 
436 /// InnerLoopVectorizer vectorizes loops which contain only one basic
437 /// block to a specified vectorization factor (VF).
438 /// This class performs the widening of scalars into vectors, or multiple
439 /// scalars. This class also implements the following features:
440 /// * It inserts an epilogue loop for handling loops that don't have iteration
441 ///   counts that are known to be a multiple of the vectorization factor.
442 /// * It handles the code generation for reduction variables.
443 /// * Scalarization (implementation using scalars) of un-vectorizable
444 ///   instructions.
445 /// InnerLoopVectorizer does not perform any vectorization-legality
446 /// checks, and relies on the caller to check for the different legality
447 /// aspects. The InnerLoopVectorizer relies on the
448 /// LoopVectorizationLegality class to provide information about the induction
449 /// and reduction variables that were found to a given vectorization factor.
450 class InnerLoopVectorizer {
451 public:
452   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
453                       LoopInfo *LI, DominatorTree *DT,
454                       const TargetLibraryInfo *TLI,
455                       const TargetTransformInfo *TTI, AssumptionCache *AC,
456                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
457                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
458                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
459                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
460       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
461         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
462         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
463         PSI(PSI), RTChecks(RTChecks) {
464     // Query this against the original loop and save it here because the profile
465     // of the original loop header may change as the transformation happens.
466     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
467         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
468   }
469 
470   virtual ~InnerLoopVectorizer() = default;
471 
472   /// Create a new empty loop that will contain vectorized instructions later
473   /// on, while the old loop will be used as the scalar remainder. Control flow
474   /// is generated around the vectorized (and scalar epilogue) loops consisting
475   /// of various checks and bypasses. Return the pre-header block of the new
476   /// loop.
  /// In the case of epilogue vectorization, this function is overridden to
478   /// handle the more complex control flow around the loops.
479   virtual BasicBlock *createVectorizedLoopSkeleton();
480 
481   /// Widen a single instruction within the innermost loop.
482   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
483                         VPTransformState &State);
484 
485   /// Widen a single call instruction within the innermost loop.
486   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
487                             VPTransformState &State);
488 
489   /// Widen a single select instruction within the innermost loop.
490   void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
491                               bool InvariantCond, VPTransformState &State);
492 
  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
494   void fixVectorizedLoop(VPTransformState &State);
495 
496   // Return true if any runtime check is added.
497   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
498 
499   /// A type for vectorized values in the new loop. Each value from the
500   /// original loop, when vectorized, is represented by UF vector values in the
501   /// new unrolled loop, where UF is the unroll factor.
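  /// For example, with UF == 2 and VF == 4, a single i32 value from the
  /// original loop is represented by two <4 x i32> vector values.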
502   using VectorParts = SmallVector<Value *, 2>;
503 
504   /// Vectorize a single GetElementPtrInst based on information gathered and
505   /// decisions taken during planning.
506   void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
507                 unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
508                 SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
509 
510   /// Vectorize a single PHINode in a block. This method handles the induction
511   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
512   /// arbitrary length vectors.
513   void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
514                            VPWidenPHIRecipe *PhiR, VPTransformState &State);
515 
516   /// A helper function to scalarize a single Instruction in the innermost loop.
517   /// Generates a sequence of scalar instances for each lane between \p MinLane
518   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
519   /// inclusive. Uses the VPValue operands from \p Operands instead of \p
520   /// Instr's operands.
521   void scalarizeInstruction(Instruction *Instr, VPValue *Def, VPUser &Operands,
522                             const VPIteration &Instance, bool IfPredicateInstr,
523                             VPTransformState &State);
524 
525   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
526   /// is provided, the integer induction variable will first be truncated to
527   /// the corresponding type.
528   void widenIntOrFpInduction(PHINode *IV, Value *Start, TruncInst *Trunc,
529                              VPValue *Def, VPValue *CastDef,
530                              VPTransformState &State);
531 
532   /// Construct the vector value of a scalarized value \p V one lane at a time.
533   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
534                                  VPTransformState &State);
535 
536   /// Try to vectorize interleaved access group \p Group with the base address
537   /// given in \p Addr, optionally masking the vector operations if \p
538   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
539   /// values in the vectorized loop.
540   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
541                                 ArrayRef<VPValue *> VPDefs,
542                                 VPTransformState &State, VPValue *Addr,
543                                 ArrayRef<VPValue *> StoredValues,
544                                 VPValue *BlockInMask = nullptr);
545 
546   /// Vectorize Load and Store instructions with the base address given in \p
547   /// Addr, optionally masking the vector operations if \p BlockInMask is
548   /// non-null. Use \p State to translate given VPValues to IR values in the
549   /// vectorized loop.
550   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
551                                   VPValue *Def, VPValue *Addr,
552                                   VPValue *StoredValue, VPValue *BlockInMask);
553 
554   /// Set the debug location in the builder using the debug location in
555   /// the instruction.
556   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
557 
558   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
559   void fixNonInductionPHIs(VPTransformState &State);
560 
561   /// Create a broadcast instruction. This method generates a broadcast
562   /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// This is needed because each iteration in the loop corresponds to a SIMD
  /// element.
566   virtual Value *getBroadcastInstrs(Value *V);
567 
568 protected:
569   friend class LoopVectorizationPlanner;
570 
571   /// A small list of PHINodes.
572   using PhiVector = SmallVector<PHINode *, 4>;
573 
574   /// A type for scalarized values in the new loop. Each value from the
575   /// original loop, when scalarized, is represented by UF x VF scalar values
576   /// in the new unrolled loop, where UF is the unroll factor and VF is the
577   /// vectorization factor.
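  /// For example, with UF == 2 and VF == 4 a single original value is
  /// represented by 2 x 4 == 8 scalar values in the unrolled loop.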
578   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
579 
580   /// Set up the values of the IVs correctly when exiting the vector loop.
581   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
582                     Value *CountRoundDown, Value *EndValue,
583                     BasicBlock *MiddleBlock);
584 
585   /// Create a new induction variable inside L.
586   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
587                                    Value *Step, Instruction *DL);
588 
589   /// Handle all cross-iteration phis in the header.
590   void fixCrossIterationPHIs(VPTransformState &State);
591 
592   /// Fix a first-order recurrence. This is the second phase of vectorizing
593   /// this phi node.
594   void fixFirstOrderRecurrence(PHINode *Phi, VPTransformState &State);
595 
596   /// Fix a reduction cross-iteration phi. This is the second phase of
597   /// vectorizing this phi node.
598   void fixReduction(VPWidenPHIRecipe *Phi, VPTransformState &State);
599 
600   /// Clear NSW/NUW flags from reduction instructions if necessary.
601   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
602                                VPTransformState &State);
603 
604   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
605   /// means we need to add the appropriate incoming value from the middle
606   /// block as exiting edges from the scalar epilogue loop (if present) are
607   /// already in place, and we exit the vector loop exclusively to the middle
608   /// block.
609   void fixLCSSAPHIs(VPTransformState &State);
610 
611   /// Iteratively sink the scalarized operands of a predicated instruction into
612   /// the block that was created for it.
613   void sinkScalarOperands(Instruction *PredInst);
614 
615   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
616   /// represented as.
617   void truncateToMinimalBitwidths(VPTransformState &State);
618 
  /// This function adds
  /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
  /// to each vector element of Val. The sequence starts at StartIdx.
  /// \p Opcode is relevant for FP induction variables.
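  /// For example (integer case), with Val == <0, 0, 0, 0>, StartIdx == 0 and
  /// Step == 1 the result is <0, 1, 2, 3>.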
623   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
624                                Instruction::BinaryOps Opcode =
625                                Instruction::BinaryOpsEnd);
626 
627   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
628   /// variable on which to base the steps, \p Step is the size of the step, and
629   /// \p EntryVal is the value from the original loop that maps to the steps.
630   /// Note that \p EntryVal doesn't have to be an induction variable - it
631   /// can also be a truncate instruction.
632   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
633                         const InductionDescriptor &ID, VPValue *Def,
634                         VPValue *CastDef, VPTransformState &State);
635 
636   /// Create a vector induction phi node based on an existing scalar one. \p
637   /// EntryVal is the value from the original loop that maps to the vector phi
638   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
639   /// truncate instruction, instead of widening the original IV, we widen a
640   /// version of the IV truncated to \p EntryVal's type.
641   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
642                                        Value *Step, Value *Start,
643                                        Instruction *EntryVal, VPValue *Def,
644                                        VPValue *CastDef,
645                                        VPTransformState &State);
646 
647   /// Returns true if an instruction \p I should be scalarized instead of
648   /// vectorized for the chosen vectorization factor.
649   bool shouldScalarizeInstruction(Instruction *I) const;
650 
651   /// Returns true if we should generate a scalar version of \p IV.
652   bool needsScalarInduction(Instruction *IV) const;
653 
654   /// If there is a cast involved in the induction variable \p ID, which should
655   /// be ignored in the vectorized loop body, this function records the
656   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
657   /// cast. We had already proved that the casted Phi is equal to the uncasted
658   /// Phi in the vectorized loop (under a runtime guard), and therefore
659   /// there is no need to vectorize the cast - the same value can be used in the
660   /// vector loop for both the Phi and the cast.
  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
  /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
663   ///
664   /// \p EntryVal is the value from the original loop that maps to the vector
665   /// phi node and is used to distinguish what is the IV currently being
666   /// processed - original one (if \p EntryVal is a phi corresponding to the
667   /// original IV) or the "newly-created" one based on the proof mentioned above
  /// (see also buildScalarSteps() and createVectorIntOrFpInductionPHI()). In the
669   /// latter case \p EntryVal is a TruncInst and we must not record anything for
670   /// that IV, but it's error-prone to expect callers of this routine to care
671   /// about that, hence this explicit parameter.
672   void recordVectorLoopValueForInductionCast(
673       const InductionDescriptor &ID, const Instruction *EntryVal,
674       Value *VectorLoopValue, VPValue *CastDef, VPTransformState &State,
675       unsigned Part, unsigned Lane = UINT_MAX);
676 
677   /// Generate a shuffle sequence that will reverse the vector Vec.
678   virtual Value *reverseVector(Value *Vec);
679 
680   /// Returns (and creates if needed) the original loop trip count.
681   Value *getOrCreateTripCount(Loop *NewLoop);
682 
683   /// Returns (and creates if needed) the trip count of the widened loop.
684   Value *getOrCreateVectorTripCount(Loop *NewLoop);
685 
686   /// Returns a bitcasted value to the requested vector type.
687   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
688   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
689                                 const DataLayout &DL);
690 
691   /// Emit a bypass check to see if the vector trip count is zero, including if
692   /// it overflows.
693   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
694 
695   /// Emit a bypass check to see if all of the SCEV assumptions we've
696   /// had to make are correct. Returns the block containing the checks or
697   /// nullptr if no checks have been added.
698   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
699 
700   /// Emit bypass checks to check any memory assumptions we may have made.
701   /// Returns the block containing the checks or nullptr if no checks have been
702   /// added.
703   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
704 
705   /// Compute the transformed value of Index at offset StartValue using step
706   /// StepValue.
707   /// For integer induction, returns StartValue + Index * StepValue.
708   /// For pointer induction, returns StartValue[Index * StepValue].
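  /// For example (integer case), with StartValue == 10, StepValue == 2 and
  /// Index == 3 the transformed value is 10 + 3 * 2 == 16.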
709   /// FIXME: The newly created binary instructions should contain nsw/nuw
710   /// flags, which can be found from the original scalar operations.
711   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
712                               const DataLayout &DL,
713                               const InductionDescriptor &ID) const;
714 
715   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
716   /// vector loop preheader, middle block and scalar preheader. Also
717   /// allocate a loop object for the new vector loop and return it.
718   Loop *createVectorLoopSkeleton(StringRef Prefix);
719 
720   /// Create new phi nodes for the induction variables to resume iteration count
721   /// in the scalar epilogue, from where the vectorized loop left off (given by
722   /// \p VectorTripCount).
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
724   /// vectorization) and the resume values can come from an additional bypass
725   /// block, the \p AdditionalBypass pair provides information about the bypass
726   /// block and the end value on the edge from bypass to this loop.
727   void createInductionResumeValues(
728       Loop *L, Value *VectorTripCount,
729       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
730 
731   /// Complete the loop skeleton by adding debug MDs, creating appropriate
732   /// conditional branches in the middle block, preparing the builder and
733   /// running the verifier. Take in the vector loop \p L as argument, and return
734   /// the preheader of the completed vector loop.
735   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
736 
737   /// Add additional metadata to \p To that was not present on \p Orig.
738   ///
739   /// Currently this is used to add the noalias annotations based on the
740   /// inserted memchecks.  Use this for instructions that are *cloned* into the
741   /// vector loop.
742   void addNewMetadata(Instruction *To, const Instruction *Orig);
743 
744   /// Add metadata from one instruction to another.
745   ///
746   /// This includes both the original MDs from \p From and additional ones (\see
747   /// addNewMetadata).  Use this for *newly created* instructions in the vector
748   /// loop.
749   void addMetadata(Instruction *To, Instruction *From);
750 
751   /// Similar to the previous function but it adds the metadata to a
752   /// vector of instructions.
753   void addMetadata(ArrayRef<Value *> To, Instruction *From);
754 
755   /// Allow subclasses to override and print debug traces before/after vplan
756   /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart() {}
  virtual void printDebugTracesAtEnd() {}
759 
760   /// The original loop.
761   Loop *OrigLoop;
762 
763   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
764   /// dynamic knowledge to simplify SCEV expressions and converts them to a
765   /// more usable form.
766   PredicatedScalarEvolution &PSE;
767 
768   /// Loop Info.
769   LoopInfo *LI;
770 
771   /// Dominator Tree.
772   DominatorTree *DT;
773 
774   /// Alias Analysis.
775   AAResults *AA;
776 
777   /// Target Library Info.
778   const TargetLibraryInfo *TLI;
779 
780   /// Target Transform Info.
781   const TargetTransformInfo *TTI;
782 
783   /// Assumption Cache.
784   AssumptionCache *AC;
785 
786   /// Interface to emit optimization remarks.
787   OptimizationRemarkEmitter *ORE;
788 
789   /// LoopVersioning.  It's only set up (non-null) if memchecks were
790   /// used.
791   ///
792   /// This is currently only used to add no-alias metadata based on the
  /// memchecks.  The actual versioning is performed manually.
794   std::unique_ptr<LoopVersioning> LVer;
795 
796   /// The vectorization SIMD factor to use. Each vector will have this many
797   /// vector elements.
798   ElementCount VF;
799 
800   /// The vectorization unroll factor to use. Each scalar is vectorized to this
801   /// many different vector instructions.
802   unsigned UF;
803 
804   /// The builder that we use
805   IRBuilder<> Builder;
806 
807   // --- Vectorization state ---
808 
809   /// The vector-loop preheader.
810   BasicBlock *LoopVectorPreHeader;
811 
812   /// The scalar-loop preheader.
813   BasicBlock *LoopScalarPreHeader;
814 
815   /// Middle Block between the vector and the scalar.
816   BasicBlock *LoopMiddleBlock;
817 
818   /// The (unique) ExitBlock of the scalar loop.  Note that
819   /// there can be multiple exiting edges reaching this block.
820   BasicBlock *LoopExitBlock;
821 
822   /// The vector loop body.
823   BasicBlock *LoopVectorBody;
824 
825   /// The scalar loop body.
826   BasicBlock *LoopScalarBody;
827 
828   /// A list of all bypass blocks. The first block is the entry of the loop.
829   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
830 
831   /// The new Induction variable which was added to the new block.
832   PHINode *Induction = nullptr;
833 
834   /// The induction variable of the old basic block.
835   PHINode *OldInduction = nullptr;
836 
837   /// Store instructions that were predicated.
838   SmallVector<Instruction *, 4> PredicatedInstructions;
839 
840   /// Trip count of the original loop.
841   Value *TripCount = nullptr;
842 
843   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
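  /// For example, with TripCount == 103 and VF * UF == 8 this is 96, leaving
  /// 7 iterations for the scalar remainder loop.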
844   Value *VectorTripCount = nullptr;
845 
846   /// The legality analysis.
847   LoopVectorizationLegality *Legal;
848 
  /// The profitability analysis.
850   LoopVectorizationCostModel *Cost;
851 
852   // Record whether runtime checks are added.
853   bool AddedSafetyChecks = false;
854 
855   // Holds the end values for each induction variable. We save the end values
856   // so we can later fix-up the external users of the induction variables.
857   DenseMap<PHINode *, Value *> IVEndValues;
858 
859   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
860   // fixed up at the end of vector code generation.
861   SmallVector<PHINode *, 8> OrigPHIsToFix;
862 
863   /// BFI and PSI are used to check for profile guided size optimizations.
864   BlockFrequencyInfo *BFI;
865   ProfileSummaryInfo *PSI;
866 
867   // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
869   bool OptForSizeBasedOnProfile;
870 
  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning up the checks if vectorization turns out to be unprofitable.
873   GeneratedRTChecks &RTChecks;
874 };
875 
876 class InnerLoopUnroller : public InnerLoopVectorizer {
877 public:
878   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
879                     LoopInfo *LI, DominatorTree *DT,
880                     const TargetLibraryInfo *TLI,
881                     const TargetTransformInfo *TTI, AssumptionCache *AC,
882                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
883                     LoopVectorizationLegality *LVL,
884                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
885                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
886       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
887                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
888                             BFI, PSI, Check) {}
889 
890 private:
891   Value *getBroadcastInstrs(Value *V) override;
892   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
893                        Instruction::BinaryOps Opcode =
894                        Instruction::BinaryOpsEnd) override;
895   Value *reverseVector(Value *Vec) override;
896 };
897 
898 /// Encapsulate information regarding vectorization of a loop and its epilogue.
899 /// This information is meant to be updated and used across two stages of
900 /// epilogue vectorization.
901 struct EpilogueLoopVectorizationInfo {
902   ElementCount MainLoopVF = ElementCount::getFixed(0);
903   unsigned MainLoopUF = 0;
904   ElementCount EpilogueVF = ElementCount::getFixed(0);
905   unsigned EpilogueUF = 0;
906   BasicBlock *MainLoopIterationCountCheck = nullptr;
907   BasicBlock *EpilogueIterationCountCheck = nullptr;
908   BasicBlock *SCEVSafetyCheck = nullptr;
909   BasicBlock *MemSafetyCheck = nullptr;
910   Value *TripCount = nullptr;
911   Value *VectorTripCount = nullptr;
912 
913   EpilogueLoopVectorizationInfo(unsigned MVF, unsigned MUF, unsigned EVF,
914                                 unsigned EUF)
915       : MainLoopVF(ElementCount::getFixed(MVF)), MainLoopUF(MUF),
916         EpilogueVF(ElementCount::getFixed(EVF)), EpilogueUF(EUF) {
917     assert(EUF == 1 &&
918            "A high UF for the epilogue loop is likely not beneficial.");
919   }
920 };
921 
922 /// An extension of the inner loop vectorizer that creates a skeleton for a
923 /// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue.  This is achieved by
927 /// deriving two concrete strategy classes from this base class and invoking
928 /// them in succession from the loop vectorizer planner.
929 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
930 public:
931   InnerLoopAndEpilogueVectorizer(
932       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
933       DominatorTree *DT, const TargetLibraryInfo *TLI,
934       const TargetTransformInfo *TTI, AssumptionCache *AC,
935       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
936       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
937       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
938       GeneratedRTChecks &Checks)
939       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
940                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
941                             Checks),
942         EPI(EPI) {}
943 
944   // Override this function to handle the more complex control flow around the
945   // three loops.
946   BasicBlock *createVectorizedLoopSkeleton() final override {
947     return createEpilogueVectorizedLoopSkeleton();
948   }
949 
950   /// The interface for creating a vectorized skeleton using one of two
951   /// different strategies, each corresponding to one execution of the vplan
952   /// as described above.
953   virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
954 
955   /// Holds and updates state information required to vectorize the main loop
956   /// and its epilogue in two separate passes. This setup helps us avoid
957   /// regenerating and recomputing runtime safety checks. It also helps us to
958   /// shorten the iteration-count-check path length for the cases where the
959   /// iteration count of the loop is so small that the main vector loop is
960   /// completely skipped.
961   EpilogueLoopVectorizationInfo &EPI;
962 };
963 
964 /// A specialized derived class of inner loop vectorizer that performs
965 /// vectorization of *main* loops in the process of vectorizing loops and their
966 /// epilogues.
967 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
968 public:
969   EpilogueVectorizerMainLoop(
970       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
971       DominatorTree *DT, const TargetLibraryInfo *TLI,
972       const TargetTransformInfo *TTI, AssumptionCache *AC,
973       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
974       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
975       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
976       GeneratedRTChecks &Check)
977       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
978                                        EPI, LVL, CM, BFI, PSI, Check) {}
979   /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e. the first pass of vplan execution).
981   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
982 
983 protected:
984   /// Emits an iteration count bypass check once for the main loop (when \p
985   /// ForEpilogue is false) and once for the epilogue loop (when \p
986   /// ForEpilogue is true).
987   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
988                                              bool ForEpilogue);
989   void printDebugTracesAtStart() override;
990   void printDebugTracesAtEnd() override;
991 };
992 
993 // A specialized derived class of inner loop vectorizer that performs
994 // vectorization of *epilogue* loops in the process of vectorizing loops and
995 // their epilogues.
996 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
997 public:
998   EpilogueVectorizerEpilogueLoop(
999       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
1000       DominatorTree *DT, const TargetLibraryInfo *TLI,
1001       const TargetTransformInfo *TTI, AssumptionCache *AC,
1002       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
1003       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
1004       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
1005       GeneratedRTChecks &Checks)
1006       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
1007                                        EPI, LVL, CM, BFI, PSI, Checks) {}
1008   /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
1010   BasicBlock *createEpilogueVectorizedLoopSkeleton() final override;
1011 
1012 protected:
1013   /// Emits an iteration count bypass check after the main vector loop has
1014   /// finished to see if there are any iterations left to execute by either
1015   /// the vector epilogue or the scalar epilogue.
1016   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
1017                                                       BasicBlock *Bypass,
1018                                                       BasicBlock *Insert);
1019   void printDebugTracesAtStart() override;
1020   void printDebugTracesAtEnd() override;
1021 };
1022 } // end namespace llvm
1023 
/// Look for a meaningful debug location on the instruction or its
1025 /// operands.
1026 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
1027   if (!I)
1028     return I;
1029 
1030   DebugLoc Empty;
1031   if (I->getDebugLoc() != Empty)
1032     return I;
1033 
1034   for (Use &Op : I->operands()) {
1035     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
1036       if (OpInst->getDebugLoc() != Empty)
1037         return OpInst;
1038   }
1039 
1040   return I;
1041 }
1042 
1043 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
1044   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
1045     const DILocation *DIL = Inst->getDebugLoc();
1046 
    // When an FSDiscriminator is enabled, we don't need to add the multiply
1048     // factors to the discriminators.
1049     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1050         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
1051       // FIXME: For scalable vectors, assume vscale=1.
1052       auto NewDIL =
1053           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1054       if (NewDIL)
1055         B.SetCurrentDebugLocation(NewDIL.getValue());
1056       else
1057         LLVM_DEBUG(dbgs()
1058                    << "Failed to create new discriminator: "
1059                    << DIL->getFilename() << " Line: " << DIL->getLine());
1060     } else
1061       B.SetCurrentDebugLocation(DIL);
1062   } else
1063     B.SetCurrentDebugLocation(DebugLoc());
1064 }
1065 
1066 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1067 /// is passed, the message relates to that particular instruction.
1068 #ifndef NDEBUG
1069 static void debugVectorizationMessage(const StringRef Prefix,
1070                                       const StringRef DebugMsg,
1071                                       Instruction *I) {
1072   dbgs() << "LV: " << Prefix << DebugMsg;
1073   if (I != nullptr)
1074     dbgs() << " " << *I;
1075   else
1076     dbgs() << '.';
1077   dbgs() << '\n';
1078 }
1079 #endif
1080 
1081 /// Create an analysis remark that explains why vectorization failed
1082 ///
1083 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1084 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1085 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1086 /// the location of the remark.  \return the remark object that can be
1087 /// streamed to.
1088 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1089     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1090   Value *CodeRegion = TheLoop->getHeader();
1091   DebugLoc DL = TheLoop->getStartLoc();
1092 
1093   if (I) {
1094     CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
1097     if (I->getDebugLoc())
1098       DL = I->getDebugLoc();
1099   }
1100 
1101   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1102 }
1103 
1104 /// Return a value for Step multiplied by VF.
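/// For example, with Step == 2 and a fixed VF of 4 this returns the constant
/// 8; for a scalable VF of <vscale x 4> it emits code computing vscale * 8 at
/// runtime (illustrative).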
1105 static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
1106   assert(isa<ConstantInt>(Step) && "Expected an integer step");
1107   Constant *StepVal = ConstantInt::get(
1108       Step->getType(),
1109       cast<ConstantInt>(Step)->getSExtValue() * VF.getKnownMinValue());
1110   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1111 }
1112 
1113 namespace llvm {
1114 
1115 /// Return the runtime value for VF.
1116 Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
1117   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1118   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1119 }
1120 
1121 void reportVectorizationFailure(const StringRef DebugMsg,
1122                                 const StringRef OREMsg, const StringRef ORETag,
1123                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1124                                 Instruction *I) {
1125   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1126   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1127   ORE->emit(
1128       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1129       << "loop not vectorized: " << OREMsg);
1130 }
1131 
1132 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1133                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1134                              Instruction *I) {
1135   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1136   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1137   ORE->emit(
1138       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1139       << Msg);
1140 }
1141 
1142 } // end namespace llvm
1143 
1144 #ifndef NDEBUG
1145 /// \return string containing a file name and a line # for the given loop.
1146 static std::string getDebugLocString(const Loop *L) {
1147   std::string Result;
1148   if (L) {
1149     raw_string_ostream OS(Result);
1150     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1151       LoopDbgLoc.print(OS);
1152     else
1153       // Just print the module name.
1154       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1155     OS.flush();
1156   }
1157   return Result;
1158 }
1159 #endif
1160 
1161 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1162                                          const Instruction *Orig) {
1163   // If the loop was versioned with memchecks, add the corresponding no-alias
1164   // metadata.
1165   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1166     LVer->annotateInstWithNoAlias(To, Orig);
1167 }
1168 
1169 void InnerLoopVectorizer::addMetadata(Instruction *To,
1170                                       Instruction *From) {
1171   propagateMetadata(To, From);
1172   addNewMetadata(To, From);
1173 }
1174 
1175 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1176                                       Instruction *From) {
1177   for (Value *V : To) {
1178     if (Instruction *I = dyn_cast<Instruction>(V))
1179       addMetadata(I, From);
1180   }
1181 }
1182 
1183 namespace llvm {
1184 
// Hints for the loop vectorization cost model on how the scalar epilogue loop
// should be lowered.
1187 enum ScalarEpilogueLowering {
1188 
1189   // The default: allowing scalar epilogues.
1190   CM_ScalarEpilogueAllowed,
1191 
1192   // Vectorization with OptForSize: don't allow epilogues.
1193   CM_ScalarEpilogueNotAllowedOptSize,
1194 
  // A special case of vectorization with OptForSize: loops with a very small
1196   // trip count are considered for vectorization under OptForSize, thereby
1197   // making sure the cost of their loop body is dominant, free of runtime
1198   // guards and scalar iteration overheads.
1199   CM_ScalarEpilogueNotAllowedLowTripLoop,
1200 
1201   // Loop hint predicate indicating an epilogue is undesired.
1202   CM_ScalarEpilogueNotNeededUsePredicate,
1203 
1204   // Directive indicating we must either tail fold or not vectorize
1205   CM_ScalarEpilogueNotAllowedUsePredicate
1206 };
1207 
1208 /// ElementCountComparator creates a total ordering for ElementCount
1209 /// for the purposes of using it in a set structure.
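/// For example (an illustrative ordering): all fixed VFs sort before all
/// scalable VFs, and within each group VFs compare by their known minimum
/// element count, i.e. 2 < 4 < 8 < vscale x 2 < vscale x 4.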
1210 struct ElementCountComparator {
1211   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1212     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1213            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1214   }
1215 };
1216 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1217 
1218 /// LoopVectorizationCostModel - estimates the expected speedups due to
1219 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
1222 /// expected speedup/slowdowns due to the supported instruction set. We use the
1223 /// TargetTransformInfo to query the different backends for the cost of
1224 /// different operations.
1225 class LoopVectorizationCostModel {
1226 public:
1227   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1228                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1229                              LoopVectorizationLegality *Legal,
1230                              const TargetTransformInfo &TTI,
1231                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1232                              AssumptionCache *AC,
1233                              OptimizationRemarkEmitter *ORE, const Function *F,
1234                              const LoopVectorizeHints *Hints,
1235                              InterleavedAccessInfo &IAI)
1236       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1237         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1238         Hints(Hints), InterleaveInfo(IAI) {}
1239 
1240   /// \return An upper bound for the vectorization factors (both fixed and
1241   /// scalable). If the factors are 0, vectorization and interleaving should be
1242   /// avoided up front.
1243   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1244 
1245   /// \return True if runtime checks are required for vectorization, and false
1246   /// otherwise.
1247   bool runtimeChecksRequired();
1248 
1249   /// \return The most profitable vectorization factor and the cost of that VF.
1250   /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1251   /// then this vectorization factor will be selected if vectorization is
1252   /// possible.
1253   VectorizationFactor
1254   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1255 
1256   VectorizationFactor
1257   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1258                                     const LoopVectorizationPlanner &LVP);
1259 
1260   /// Setup cost-based decisions for user vectorization factor.
1261   void selectUserVectorizationFactor(ElementCount UserVF) {
1262     collectUniformsAndScalars(UserVF);
1263     collectInstsToScalarize(UserVF);
1264   }
1265 
1266   /// \return The size (in bits) of the smallest and widest types in the code
1267   /// that needs to be vectorized. We ignore values that remain scalar such as
1268   /// 64 bit loop indices.
1269   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1270 
1271   /// \return The desired interleave count.
1272   /// If interleave count has been specified by metadata it will be returned.
1273   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1274   /// are the selected vectorization factor and the cost of the selected VF.
1275   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1276 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
1281   /// the lists of loop-uniform and loop-scalar instructions.
1282   /// The calculated cost is saved with widening decision in order to
1283   /// avoid redundant calculations.
1284   void setCostBasedWideningDecision(ElementCount VF);
1285 
1286   /// A struct that represents some properties of the register usage
1287   /// of a loop.
1288   struct RegisterUsage {
1289     /// Holds the number of loop invariant values that are used in the loop.
1290     /// The key is ClassID of target-provided register class.
1291     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1292     /// Holds the maximum number of concurrent live intervals in the loop.
1293     /// The key is ClassID of target-provided register class.
1294     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1295   };
1296 
  /// \return Information about the register usage of the loop for the
  /// given vectorization factors.
1299   SmallVector<RegisterUsage, 8>
1300   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1301 
1302   /// Collect values we want to ignore in the cost model.
1303   void collectValuesToIgnore();
1304 
1305   /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1307   void collectInLoopReductions();
1308 
1309   /// \returns The smallest bitwidth each instruction can be represented with.
1310   /// The vector equivalents of these instructions should be truncated to this
1311   /// type.
1312   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1313     return MinBWs;
1314   }
1315 
1316   /// \returns True if it is more profitable to scalarize instruction \p I for
1317   /// vectorization factor \p VF.
1318   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1319     assert(VF.isVector() &&
1320            "Profitable to scalarize relevant only for VF > 1.");
1321 
1322     // Cost model is not run in the VPlan-native path - return conservative
1323     // result until this changes.
1324     if (EnableVPlanNativePath)
1325       return false;
1326 
1327     auto Scalars = InstsToScalarize.find(VF);
1328     assert(Scalars != InstsToScalarize.end() &&
1329            "VF not yet analyzed for scalarization profitability");
1330     return Scalars->second.find(I) != Scalars->second.end();
1331   }
1332 
1333   /// Returns true if \p I is known to be uniform after vectorization.
1334   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1335     if (VF.isScalar())
1336       return true;
1337 
1338     // Cost model is not run in the VPlan-native path - return conservative
1339     // result until this changes.
1340     if (EnableVPlanNativePath)
1341       return false;
1342 
1343     auto UniformsPerVF = Uniforms.find(VF);
1344     assert(UniformsPerVF != Uniforms.end() &&
1345            "VF not yet analyzed for uniformity");
1346     return UniformsPerVF->second.count(I);
1347   }
1348 
1349   /// Returns true if \p I is known to be scalar after vectorization.
1350   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1351     if (VF.isScalar())
1352       return true;
1353 
1354     // Cost model is not run in the VPlan-native path - return conservative
1355     // result until this changes.
1356     if (EnableVPlanNativePath)
1357       return false;
1358 
1359     auto ScalarsPerVF = Scalars.find(VF);
1360     assert(ScalarsPerVF != Scalars.end() &&
1361            "Scalar values are not calculated for VF");
1362     return ScalarsPerVF->second.count(I);
1363   }
1364 
1365   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1366   /// for vectorization factor \p VF.
1367   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1368     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1369            !isProfitableToScalarize(I, VF) &&
1370            !isScalarAfterVectorization(I, VF);
1371   }
1372 
  /// Decision that was taken during cost calculation for a memory instruction.
1374   enum InstWidening {
1375     CM_Unknown,
1376     CM_Widen,         // For consecutive accesses with stride +1.
1377     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1378     CM_Interleave,
1379     CM_GatherScatter,
1380     CM_Scalarize
1381   };
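  // An illustrative (non-exhaustive) mapping of the decisions above: a
  // consecutive access with stride +1 typically becomes CM_Widen and one with
  // stride -1 CM_Widen_Reverse, a member of an interleave group becomes
  // CM_Interleave, and an access that cannot be widened becomes
  // CM_GatherScatter if the target supports it, or CM_Scalarize otherwise.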
1382 
1383   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1384   /// instruction \p I and vector width \p VF.
1385   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1386                            InstructionCost Cost) {
1387     assert(VF.isVector() && "Expected VF >=2");
1388     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1389   }
1390 
1391   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1392   /// interleaving group \p Grp and vector width \p VF.
1393   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1394                            ElementCount VF, InstWidening W,
1395                            InstructionCost Cost) {
1396     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group,
    // but the cost will be assigned to one instruction only.
1399     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1400       if (auto *I = Grp->getMember(i)) {
1401         if (Grp->getInsertPos() == I)
1402           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1403         else
1404           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1405       }
1406     }
1407   }
1408 
1409   /// Return the cost model decision for the given instruction \p I and vector
1410   /// width \p VF. Return CM_Unknown if this instruction did not pass
1411   /// through the cost modeling.
1412   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1413     assert(VF.isVector() && "Expected VF to be a vector VF");
1414     // Cost model is not run in the VPlan-native path - return conservative
1415     // result until this changes.
1416     if (EnableVPlanNativePath)
1417       return CM_GatherScatter;
1418 
1419     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1420     auto Itr = WideningDecisions.find(InstOnVF);
1421     if (Itr == WideningDecisions.end())
1422       return CM_Unknown;
1423     return Itr->second.first;
1424   }
1425 
1426   /// Return the vectorization cost for the given instruction \p I and vector
1427   /// width \p VF.
1428   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1429     assert(VF.isVector() && "Expected VF >=2");
1430     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1431     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1432            "The cost is not calculated");
1433     return WideningDecisions[InstOnVF].second;
1434   }
1435 
1436   /// Return True if instruction \p I is an optimizable truncate whose operand
1437   /// is an induction variable. Such a truncate will be removed by adding a new
1438   /// induction variable with the destination type.
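  /// For example (hypothetical IR names): given an i64 induction variable
  /// %iv, a truncate such as
  ///   %iv.trunc = trunc i64 %iv to i32
  /// can be handled by introducing a new i32 induction variable, which makes
  /// the truncate itself dead.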
1439   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1440     // If the instruction is not a truncate, return false.
1441     auto *Trunc = dyn_cast<TruncInst>(I);
1442     if (!Trunc)
1443       return false;
1444 
1445     // Get the source and destination types of the truncate.
1446     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1447     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1448 
1449     // If the truncate is free for the given types, return false. Replacing a
1450     // free truncate with an induction variable would add an induction variable
1451     // update instruction to each iteration of the loop. We exclude from this
1452     // check the primary induction variable since it will need an update
1453     // instruction regardless.
1454     Value *Op = Trunc->getOperand(0);
1455     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1456       return false;
1457 
1458     // If the truncated value is not an induction variable, return false.
1459     return Legal->isInductionPhi(Op);
1460   }
1461 
1462   /// Collects the instructions to scalarize for each predicated instruction in
1463   /// the loop.
1464   void collectInstsToScalarize(ElementCount VF);
1465 
1466   /// Collect Uniform and Scalar values for the given \p VF.
1467   /// The sets depend on CM decision for Load/Store instructions
1468   /// that may be vectorized as interleave, gather-scatter or scalarized.
1469   void collectUniformsAndScalars(ElementCount VF) {
1470     // Do the analysis once.
1471     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1472       return;
1473     setCostBasedWideningDecision(VF);
1474     collectLoopUniforms(VF);
1475     collectLoopScalars(VF);
1476   }
1477 
1478   /// Returns true if the target machine supports masked store operation
1479   /// for the given \p DataType and kind of access to \p Ptr.
1480   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1481     return Legal->isConsecutivePtr(Ptr) &&
1482            TTI.isLegalMaskedStore(DataType, Alignment);
1483   }
1484 
1485   /// Returns true if the target machine supports masked load operation
1486   /// for the given \p DataType and kind of access to \p Ptr.
1487   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1488     return Legal->isConsecutivePtr(Ptr) &&
1489            TTI.isLegalMaskedLoad(DataType, Alignment);
1490   }
1491 
1492   /// Returns true if the target machine can represent \p V as a masked gather
1493   /// or scatter operation.
1494   bool isLegalGatherOrScatter(Value *V) {
1495     bool LI = isa<LoadInst>(V);
1496     bool SI = isa<StoreInst>(V);
1497     if (!LI && !SI)
1498       return false;
1499     auto *Ty = getLoadStoreType(V);
1500     Align Align = getLoadStoreAlignment(V);
1501     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1502            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1503   }
1504 
1505   /// Returns true if the target machine supports all of the reduction
1506   /// variables found for the given VF.
1507   bool canVectorizeReductions(ElementCount VF) {
1508     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1509       RecurrenceDescriptor RdxDesc = Reduction.second;
1510       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1511     }));
1512   }
1513 
1514   /// Returns true if \p I is an instruction that will be scalarized with
1515   /// predication. Such instructions include conditional stores and
1516   /// instructions that may divide by zero.
  /// If a non-zero VF has been calculated, we check if I will be scalarized
  /// with predication for that VF.
1519   bool isScalarWithPredication(Instruction *I) const;
1520 
1521   // Returns true if \p I is an instruction that will be predicated either
1522   // through scalar predication or masked load/store or masked gather/scatter.
1523   // Superset of instructions that return true for isScalarWithPredication.
1524   bool isPredicatedInst(Instruction *I) {
1525     if (!blockNeedsPredication(I->getParent()))
1526       return false;
1527     // Loads and stores that need some form of masked operation are predicated
1528     // instructions.
1529     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1530       return Legal->isMaskRequired(I);
1531     return isScalarWithPredication(I);
1532   }
1533 
1534   /// Returns true if \p I is a memory instruction with consecutive memory
1535   /// access that can be widened.
1536   bool
1537   memoryInstructionCanBeWidened(Instruction *I,
1538                                 ElementCount VF = ElementCount::getFixed(1));
1539 
1540   /// Returns true if \p I is a memory instruction in an interleaved-group
1541   /// of memory accesses that can be vectorized with wide vector loads/stores
1542   /// and shuffles.
1543   bool
1544   interleavedAccessCanBeWidened(Instruction *I,
1545                                 ElementCount VF = ElementCount::getFixed(1));
1546 
1547   /// Check if \p Instr belongs to any interleaved access group.
1548   bool isAccessInterleaved(Instruction *Instr) {
1549     return InterleaveInfo.isInterleaved(Instr);
1550   }
1551 
1552   /// Get the interleaved access group that \p Instr belongs to.
1553   const InterleaveGroup<Instruction> *
1554   getInterleavedAccessGroup(Instruction *Instr) {
1555     return InterleaveInfo.getInterleaveGroup(Instr);
1556   }
1557 
1558   /// Returns true if we're required to use a scalar epilogue for at least
1559   /// the final iteration of the original loop.
1560   bool requiresScalarEpilogue() const {
1561     if (!isScalarEpilogueAllowed())
1562       return false;
1563     // If we might exit from anywhere but the latch, must run the exiting
1564     // iteration in scalar form.
1565     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1566       return true;
1567     return InterleaveInfo.requiresScalarEpilogue();
1568   }
1569 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1572   bool isScalarEpilogueAllowed() const {
1573     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1574   }
1575 
  /// Returns true if all loop blocks should be masked in order to fold the
  /// loop tail.
1577   bool foldTailByMasking() const { return FoldTailByMasking; }
1578 
1579   bool blockNeedsPredication(BasicBlock *BB) const {
1580     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1581   }
1582 
1583   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1584   /// nodes to the chain of instructions representing the reductions. Uses a
1585   /// MapVector to ensure deterministic iteration order.
1586   using ReductionChainMap =
1587       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1588 
1589   /// Return the chain of instructions representing an inloop reduction.
1590   const ReductionChainMap &getInLoopReductionChains() const {
1591     return InLoopReductionChains;
1592   }
1593 
1594   /// Returns true if the Phi is part of an inloop reduction.
1595   bool isInLoopReduction(PHINode *Phi) const {
1596     return InLoopReductionChains.count(Phi);
1597   }
1598 
1599   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1600   /// with factor VF.  Return the cost of the instruction, including
1601   /// scalarization overhead if it's needed.
1602   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1603 
1604   /// Estimate cost of a call instruction CI if it were vectorized with factor
1605   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized - i.e. either a vector version isn't available, or it is too
  /// expensive.
1609   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1610                                     bool &NeedToScalarize) const;
1611 
1612   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1613   /// that of B.
1614   bool isMoreProfitable(const VectorizationFactor &A,
1615                         const VectorizationFactor &B) const;
1616 
1617   /// Invalidates decisions already taken by the cost model.
1618   void invalidateCostModelingDecisions() {
1619     WideningDecisions.clear();
1620     Uniforms.clear();
1621     Scalars.clear();
1622   }
1623 
1624 private:
1625   unsigned NumPredStores = 0;
1626 
1627   /// \return An upper bound for the vectorization factors for both
1628   /// fixed and scalable vectorization, where the minimum-known number of
1629   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1630   /// disabled or unsupported, then the scalable part will be equal to
1631   /// ElementCount::getScalable(0).
1632   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1633                                            ElementCount UserVF);
1634 
  /// \return the maximized element count based on the target's vector
1636   /// registers and the loop trip-count, but limited to a maximum safe VF.
1637   /// This is a helper function of computeFeasibleMaxVF.
1638   /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1639   /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
1641   /// D98509). The issue is currently under investigation and this workaround
1642   /// will be removed as soon as possible.
1643   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1644                                        unsigned SmallestType,
1645                                        unsigned WidestType,
1646                                        const ElementCount &MaxSafeVF);
1647 
1648   /// \return the maximum legal scalable VF, based on the safe max number
1649   /// of elements.
1650   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1651 
  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
  /// operate on vector values after type legalization in the backend. If this
  /// latter value is false, then all operations will be scalarized (i.e. no
  /// vectorization has actually taken place).
1659   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1660 
1661   /// Returns the expected execution cost. The unit of the cost does
1662   /// not matter because we use the 'cost' units to compare different
1663   /// vector widths. The cost that is returned is *not* normalized by
1664   /// the factor width.
1665   VectorizationCostTy expectedCost(ElementCount VF);
1666 
1667   /// Returns the execution time cost of an instruction for a given vector
1668   /// width. Vector width of one means scalar.
1669   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1670 
1671   /// The cost-computation logic from getInstructionCost which provides
1672   /// the vector type as an output parameter.
1673   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1674                                      Type *&VectorTy);
1675 
1676   /// Return the cost of instructions in an inloop reduction pattern, if I is
1677   /// part of that pattern.
1678   InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
1679                                           Type *VectorTy,
1680                                           TTI::TargetCostKind CostKind);
1681 
1682   /// Calculate vectorization cost of memory instruction \p I.
1683   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1684 
1685   /// The cost computation for scalarized memory instruction.
1686   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1687 
1688   /// The cost computation for interleaving group of memory instructions.
1689   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1690 
1691   /// The cost computation for Gather/Scatter instruction.
1692   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1693 
1694   /// The cost computation for widening instruction \p I with consecutive
1695   /// memory access.
1696   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1697 
  /// The cost calculation for a Load/Store instruction \p I with a uniform
  /// pointer:
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored ? 0 : extract of last
  /// element).
1702   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1703 
1704   /// Estimate the overhead of scalarizing an instruction. This is a
1705   /// convenience wrapper for the type-based getScalarizationOverhead API.
1706   InstructionCost getScalarizationOverhead(Instruction *I,
1707                                            ElementCount VF) const;
1708 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1711   bool isConsecutiveLoadOrStore(Instruction *I);
1712 
1713   /// Returns true if an artificially high cost for emulated masked memrefs
1714   /// should be used.
1715   bool useEmulatedMaskMemRefHack(Instruction *I);
1716 
1717   /// Map of scalar integer values to the smallest bitwidth they can be legally
1718   /// represented as. The vector equivalents of these values should be truncated
1719   /// to this type.
1720   MapVector<Instruction *, uint64_t> MinBWs;
1721 
1722   /// A type representing the costs for instructions if they were to be
1723   /// scalarized rather than vectorized. The entries are Instruction-Cost
1724   /// pairs.
1725   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1726 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1729   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1730 
1731   /// Records whether it is allowed to have the original scalar loop execute at
1732   /// least once. This may be needed as a fallback loop in case runtime
1733   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
1735   /// or as a peel-loop to handle gaps in interleave-groups.
1736   /// Under optsize and when the trip count is very small we don't allow any
1737   /// iterations to execute in the scalar loop.
1738   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1739 
1740   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1741   bool FoldTailByMasking = false;
1742 
1743   /// A map holding scalar costs for different vectorization factors. The
1744   /// presence of a cost for an instruction in the mapping indicates that the
1745   /// instruction will be scalarized when vectorizing with the associated
1746   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1747   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1748 
1749   /// Holds the instructions known to be uniform after vectorization.
1750   /// The data is collected per VF.
1751   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1752 
1753   /// Holds the instructions known to be scalar after vectorization.
1754   /// The data is collected per VF.
1755   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1756 
1757   /// Holds the instructions (address computations) that are forced to be
1758   /// scalarized.
1759   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1760 
1761   /// PHINodes of the reductions that should be expanded in-loop along with
1762   /// their associated chains of reduction operations, in program order from top
1763   /// (PHI) to bottom
1764   ReductionChainMap InLoopReductionChains;
1765 
1766   /// A Map of inloop reduction operations and their immediate chain operand.
1767   /// FIXME: This can be removed once reductions can be costed correctly in
1768   /// vplan. This was added to allow quick lookup to the inloop operations,
1769   /// without having to loop through InLoopReductionChains.
1770   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1771 
1772   /// Returns the expected difference in cost from scalarizing the expression
1773   /// feeding a predicated instruction \p PredInst. The instructions to
1774   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1775   /// non-negative return value implies the expression will be scalarized.
1776   /// Currently, only single-use chains are considered for scalarization.
1777   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1778                               ElementCount VF);
1779 
1780   /// Collect the instructions that are uniform after vectorization. An
1781   /// instruction is uniform if we represent it with a single scalar value in
1782   /// the vectorized loop corresponding to each vector iteration. Examples of
1783   /// uniform instructions include pointer operands of consecutive or
1784   /// interleaved memory accesses. Note that although uniformity implies an
1785   /// instruction will be scalar, the reverse is not true. In general, a
1786   /// scalarized instruction will be represented by VF scalar values in the
1787   /// vectorized loop, each corresponding to an iteration of the original
1788   /// scalar loop.
1789   void collectLoopUniforms(ElementCount VF);
1790 
1791   /// Collect the instructions that are scalar after vectorization. An
1792   /// instruction is scalar if it is known to be uniform or will be scalarized
1793   /// during vectorization. Non-uniform scalarized instructions will be
1794   /// represented by VF values in the vectorized loop, each corresponding to an
1795   /// iteration of the original scalar loop.
1796   void collectLoopScalars(ElementCount VF);
1797 
1798   /// Keeps cost model vectorization decision and cost for instructions.
1799   /// Right now it is used for memory instructions only.
1800   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1801                                 std::pair<InstWidening, InstructionCost>>;
1802 
1803   DecisionList WideningDecisions;
1804 
1805   /// Returns true if \p V is expected to be vectorized and it needs to be
1806   /// extracted.
1807   bool needsExtract(Value *V, ElementCount VF) const {
1808     Instruction *I = dyn_cast<Instruction>(V);
1809     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1810         TheLoop->isLoopInvariant(I))
1811       return false;
1812 
1813     // Assume we can vectorize V (and hence we need extraction) if the
1814     // scalars are not computed yet. This can happen, because it is called
1815     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1816     // the scalars are collected. That should be a safe assumption in most
1817     // cases, because we check if the operands have vectorizable types
1818     // beforehand in LoopVectorizationLegality.
1819     return Scalars.find(VF) == Scalars.end() ||
1820            !isScalarAfterVectorization(I, VF);
1821   };
1822 
1823   /// Returns a range containing only operands needing to be extracted.
1824   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1825                                                    ElementCount VF) const {
1826     return SmallVector<Value *, 4>(make_filter_range(
1827         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1828   }
1829 
1830   /// Determines if we have the infrastructure to vectorize loop \p L and its
1831   /// epilogue, assuming the main loop is vectorized by \p VF.
1832   bool isCandidateForEpilogueVectorization(const Loop &L,
1833                                            const ElementCount VF) const;
1834 
1835   /// Returns true if epilogue vectorization is considered profitable, and
1836   /// false otherwise.
1837   /// \p VF is the vectorization factor chosen for the original loop.
1838   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1839 
1840 public:
1841   /// The loop that we evaluate.
1842   Loop *TheLoop;
1843 
1844   /// Predicated scalar evolution analysis.
1845   PredicatedScalarEvolution &PSE;
1846 
1847   /// Loop Info analysis.
1848   LoopInfo *LI;
1849 
1850   /// Vectorization legality.
1851   LoopVectorizationLegality *Legal;
1852 
1853   /// Vector target information.
1854   const TargetTransformInfo &TTI;
1855 
1856   /// Target Library Info.
1857   const TargetLibraryInfo *TLI;
1858 
1859   /// Demanded bits analysis.
1860   DemandedBits *DB;
1861 
1862   /// Assumption cache.
1863   AssumptionCache *AC;
1864 
1865   /// Interface to emit optimization remarks.
1866   OptimizationRemarkEmitter *ORE;
1867 
1868   const Function *TheFunction;
1869 
1870   /// Loop Vectorize Hint.
1871   const LoopVectorizeHints *Hints;
1872 
  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1875   InterleavedAccessInfo &InterleaveInfo;
1876 
1877   /// Values to ignore in the cost model.
1878   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1879 
1880   /// Values to ignore in the cost model when VF > 1.
1881   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1882 
1883   /// Profitable vector factors.
1884   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1885 };
1886 } // end namespace llvm
1887 
1888 /// Helper struct to manage generating runtime checks for vectorization.
1889 ///
/// The runtime checks are created up-front in temporary blocks, un-linked from
/// the existing IR, to allow a better estimate of their cost. After deciding
/// to vectorize, the checks are moved back into the IR. If deciding not to
/// vectorize, the temporary blocks are completely removed.
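///
/// A rough usage sketch (illustrative only; argument lists elided):
///   GeneratedRTChecks Checks(SE, DT, LI, DL);
///   Checks.Create(L, LAI, UnionPred);  // Build the checks in temporary blocks.
///   // ... estimate the cost of the generated checks ...
///   Checks.emitSCEVChecks(...);        // When vectorizing, wire the checks in.
///   Checks.emitMemRuntimeChecks(...);
///   // Otherwise the destructor removes the unused temporary blocks.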
1894 class GeneratedRTChecks {
1895   /// Basic block which contains the generated SCEV checks, if any.
1896   BasicBlock *SCEVCheckBlock = nullptr;
1897 
1898   /// The value representing the result of the generated SCEV checks. If it is
1899   /// nullptr, either no SCEV checks have been generated or they have been used.
1900   Value *SCEVCheckCond = nullptr;
1901 
1902   /// Basic block which contains the generated memory runtime checks, if any.
1903   BasicBlock *MemCheckBlock = nullptr;
1904 
1905   /// The value representing the result of the generated memory runtime checks.
1906   /// If it is nullptr, either no memory runtime checks have been generated or
1907   /// they have been used.
1908   Instruction *MemRuntimeCheckCond = nullptr;
1909 
1910   DominatorTree *DT;
1911   LoopInfo *LI;
1912 
1913   SCEVExpander SCEVExp;
1914   SCEVExpander MemCheckExp;
1915 
1916 public:
1917   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1918                     const DataLayout &DL)
1919       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1920         MemCheckExp(SE, DL, "scev.check") {}
1921 
1922   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1923   /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
1925   /// there is no vector code generation, the check blocks are removed
1926   /// completely.
1927   void Create(Loop *L, const LoopAccessInfo &LAI,
1928               const SCEVUnionPredicate &UnionPred) {
1929 
1930     BasicBlock *LoopHeader = L->getHeader();
1931     BasicBlock *Preheader = L->getLoopPreheader();
1932 
1933     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1934     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1935     // may be used by SCEVExpander. The blocks will be un-linked from their
1936     // predecessors and removed from LI & DT at the end of the function.
1937     if (!UnionPred.isAlwaysTrue()) {
1938       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1939                                   nullptr, "vector.scevcheck");
1940 
1941       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1942           &UnionPred, SCEVCheckBlock->getTerminator());
1943     }
1944 
1945     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1946     if (RtPtrChecking.Need) {
1947       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1948       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1949                                  "vector.memcheck");
1950 
1951       std::tie(std::ignore, MemRuntimeCheckCond) =
1952           addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1953                            RtPtrChecking.getChecks(), MemCheckExp);
1954       assert(MemRuntimeCheckCond &&
1955              "no RT checks generated although RtPtrChecking "
1956              "claimed checks are required");
1957     }
1958 
1959     if (!MemCheckBlock && !SCEVCheckBlock)
1960       return;
1961 
    // Unhook the temporary blocks with the checks and update various places
    // accordingly.
1964     if (SCEVCheckBlock)
1965       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1966     if (MemCheckBlock)
1967       MemCheckBlock->replaceAllUsesWith(Preheader);
1968 
1969     if (SCEVCheckBlock) {
1970       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1971       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1972       Preheader->getTerminator()->eraseFromParent();
1973     }
1974     if (MemCheckBlock) {
1975       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1976       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1977       Preheader->getTerminator()->eraseFromParent();
1978     }
1979 
1980     DT->changeImmediateDominator(LoopHeader, Preheader);
1981     if (MemCheckBlock) {
1982       DT->eraseNode(MemCheckBlock);
1983       LI->removeBlock(MemCheckBlock);
1984     }
1985     if (SCEVCheckBlock) {
1986       DT->eraseNode(SCEVCheckBlock);
1987       LI->removeBlock(SCEVCheckBlock);
1988     }
1989   }
1990 
1991   /// Remove the created SCEV & memory runtime check blocks & instructions, if
1992   /// unused.
1993   ~GeneratedRTChecks() {
1994     SCEVExpanderCleaner SCEVCleaner(SCEVExp, *DT);
1995     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp, *DT);
1996     if (!SCEVCheckCond)
1997       SCEVCleaner.markResultUsed();
1998 
1999     if (!MemRuntimeCheckCond)
2000       MemCheckCleaner.markResultUsed();
2001 
2002     if (MemRuntimeCheckCond) {
2003       auto &SE = *MemCheckExp.getSE();
2004       // Memory runtime check generation creates compares that use expanded
2005       // values. Remove them before running the SCEVExpanderCleaners.
2006       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2007         if (MemCheckExp.isInsertedInstruction(&I))
2008           continue;
2009         SE.forgetValue(&I);
2010         SE.eraseValueFromMap(&I);
2011         I.eraseFromParent();
2012       }
2013     }
2014     MemCheckCleaner.cleanup();
2015     SCEVCleaner.cleanup();
2016 
2017     if (SCEVCheckCond)
2018       SCEVCheckBlock->eraseFromParent();
2019     if (MemRuntimeCheckCond)
2020       MemCheckBlock->eraseFromParent();
2021   }
2022 
2023   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2024   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2025   /// depending on the generated condition.
2026   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2027                              BasicBlock *LoopVectorPreHeader,
2028                              BasicBlock *LoopExitBlock) {
2029     if (!SCEVCheckCond)
2030       return nullptr;
2031     if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2032       if (C->isZero())
2033         return nullptr;
2034 
2035     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2036 
2037     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2038     // Create new preheader for vector loop.
2039     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2040       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2041 
2042     SCEVCheckBlock->getTerminator()->eraseFromParent();
2043     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2044     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2045                                                 SCEVCheckBlock);
2046 
2047     DT->addNewBlock(SCEVCheckBlock, Pred);
2048     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2049 
2050     ReplaceInstWithInst(
2051         SCEVCheckBlock->getTerminator(),
2052         BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2053     // Mark the check as used, to prevent it from being removed during cleanup.
2054     SCEVCheckCond = nullptr;
2055     return SCEVCheckBlock;
2056   }
2057 
2058   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2059   /// the branches to branch to the vector preheader or \p Bypass, depending on
2060   /// the generated condition.
2061   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2062                                    BasicBlock *LoopVectorPreHeader) {
2063     // Check if we generated code that checks in runtime if arrays overlap.
2064     if (!MemRuntimeCheckCond)
2065       return nullptr;
2066 
2067     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2068     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2069                                                 MemCheckBlock);
2070 
2071     DT->addNewBlock(MemCheckBlock, Pred);
2072     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2073     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2074 
2075     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2076       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2077 
2078     ReplaceInstWithInst(
2079         MemCheckBlock->getTerminator(),
2080         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2081     MemCheckBlock->getTerminator()->setDebugLoc(
2082         Pred->getTerminator()->getDebugLoc());
2083 
2084     // Mark the check as used, to prevent it from being removed during cleanup.
2085     MemRuntimeCheckCond = nullptr;
2086     return MemCheckBlock;
2087   }
2088 };
2089 
2090 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2091 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2093 // vector length information is not provided, vectorization is not considered
2094 // explicit. Interleave hints are not allowed either. These limitations will be
2095 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang loop
// vectorize' semantics. This pragma provides *auto-vectorization hints*
// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
// provides *explicit vectorization hints* (LV can bypass legality checks and
2100 // assume that vectorization is legal). However, both hints are implemented
2101 // using the same metadata (llvm.loop.vectorize, processed by
2102 // LoopVectorizeHints). This will be fixed in the future when the native IR
2103 // representation for pragma 'omp simd' is introduced.
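// For example (illustrative source-level annotations that satisfy the above):
//   #pragma omp simd simdlen(4)
//   #pragma clang loop vectorize(enable) vectorize_width(4)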
2104 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2105                                    OptimizationRemarkEmitter *ORE) {
2106   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2107   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2108 
2109   // Only outer loops with an explicit vectorization hint are supported.
2110   // Unannotated outer loops are ignored.
2111   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2112     return false;
2113 
2114   Function *Fn = OuterLp->getHeader()->getParent();
2115   if (!Hints.allowVectorization(Fn, OuterLp,
2116                                 true /*VectorizeOnlyWhenForced*/)) {
2117     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2118     return false;
2119   }
2120 
2121   if (Hints.getInterleave() > 1) {
2122     // TODO: Interleave support is future work.
2123     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2124                          "outer loops.\n");
2125     Hints.emitRemarkWithHints();
2126     return false;
2127   }
2128 
2129   return true;
2130 }
2131 
2132 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2133                                   OptimizationRemarkEmitter *ORE,
2134                                   SmallVectorImpl<Loop *> &V) {
2135   // Collect inner loops and outer loops without irreducible control flow. For
2136   // now, only collect outer loops that have explicit vectorization hints. If we
2137   // are stress testing the VPlan H-CFG construction, we collect the outermost
2138   // loop of every loop nest.
2139   if (L.isInnermost() || VPlanBuildStressTest ||
2140       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2141     LoopBlocksRPO RPOT(&L);
2142     RPOT.perform(LI);
2143     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2144       V.push_back(&L);
2145       // TODO: Collect inner loops inside marked outer loops in case
2146       // vectorization fails for the outer loop. Do not invoke
2147       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2148       // already known to be reducible. We can use an inherited attribute for
2149       // that.
2150       return;
2151     }
2152   }
2153   for (Loop *InnerL : L)
2154     collectSupportedLoops(*InnerL, LI, ORE, V);
2155 }
2156 
2157 namespace {
2158 
2159 /// The LoopVectorize Pass.
2160 struct LoopVectorize : public FunctionPass {
2161   /// Pass identification, replacement for typeid
2162   static char ID;
2163 
2164   LoopVectorizePass Impl;
2165 
2166   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2167                          bool VectorizeOnlyWhenForced = false)
2168       : FunctionPass(ID),
2169         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2170     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2171   }
2172 
2173   bool runOnFunction(Function &F) override {
2174     if (skipFunction(F))
2175       return false;
2176 
2177     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2178     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2179     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2180     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2181     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2182     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2183     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2184     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2185     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2186     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2187     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2188     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2189     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2190 
2191     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2192         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2193 
2194     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2195                         GetLAA, *ORE, PSI).MadeAnyChange;
2196   }
2197 
2198   void getAnalysisUsage(AnalysisUsage &AU) const override {
2199     AU.addRequired<AssumptionCacheTracker>();
2200     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2201     AU.addRequired<DominatorTreeWrapperPass>();
2202     AU.addRequired<LoopInfoWrapperPass>();
2203     AU.addRequired<ScalarEvolutionWrapperPass>();
2204     AU.addRequired<TargetTransformInfoWrapperPass>();
2205     AU.addRequired<AAResultsWrapperPass>();
2206     AU.addRequired<LoopAccessLegacyAnalysis>();
2207     AU.addRequired<DemandedBitsWrapperPass>();
2208     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2209     AU.addRequired<InjectTLIMappingsLegacy>();
2210 
2211     // We currently do not preserve loopinfo/dominator analyses with outer loop
2212     // vectorization. Until this is addressed, mark these analyses as preserved
2213     // only for non-VPlan-native path.
2214     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2215     if (!EnableVPlanNativePath) {
2216       AU.addPreserved<LoopInfoWrapperPass>();
2217       AU.addPreserved<DominatorTreeWrapperPass>();
2218     }
2219 
2220     AU.addPreserved<BasicAAWrapperPass>();
2221     AU.addPreserved<GlobalsAAWrapperPass>();
2222     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2223   }
2224 };
2225 
2226 } // end anonymous namespace
2227 
2228 //===----------------------------------------------------------------------===//
2229 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2230 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2231 //===----------------------------------------------------------------------===//
2232 
2233 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2234   // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
2237   Instruction *Instr = dyn_cast<Instruction>(V);
2238   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2239                      (!Instr ||
2240                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2241   // Place the code for broadcasting invariant variables in the new preheader.
2242   IRBuilder<>::InsertPointGuard Guard(Builder);
2243   if (SafeToHoist)
2244     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2245 
2246   // Broadcast the scalar into all locations in the vector.
2247   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2248 
2249   return Shuf;
2250 }
2251 
2252 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2253     const InductionDescriptor &II, Value *Step, Value *Start,
2254     Instruction *EntryVal, VPValue *Def, VPValue *CastDef,
2255     VPTransformState &State) {
2256   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2257          "Expected either an induction phi-node or a truncate of it!");
2258 
2259   // Construct the initial value of the vector IV in the vector loop preheader
2260   auto CurrIP = Builder.saveIP();
2261   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2262   if (isa<TruncInst>(EntryVal)) {
2263     assert(Start->getType()->isIntegerTy() &&
2264            "Truncation requires an integer type");
2265     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2266     Step = Builder.CreateTrunc(Step, TruncType);
2267     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2268   }
2269   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
2270   Value *SteppedStart =
2271       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
2272 
2273   // We create vector phi nodes for both integer and floating-point induction
2274   // variables. Here, we determine the kind of arithmetic we will perform.
2275   Instruction::BinaryOps AddOp;
2276   Instruction::BinaryOps MulOp;
2277   if (Step->getType()->isIntegerTy()) {
2278     AddOp = Instruction::Add;
2279     MulOp = Instruction::Mul;
2280   } else {
2281     AddOp = II.getInductionOpcode();
2282     MulOp = Instruction::FMul;
2283   }
2284 
2285   // Multiply the vectorization factor by the step using integer or
2286   // floating-point arithmetic as appropriate.
2287   Type *StepType = Step->getType();
2288   if (Step->getType()->isFloatingPointTy())
2289     StepType = IntegerType::get(StepType->getContext(),
2290                                 StepType->getScalarSizeInBits());
2291   Value *RuntimeVF = getRuntimeVF(Builder, StepType, VF);
2292   if (Step->getType()->isFloatingPointTy())
2293     RuntimeVF = Builder.CreateSIToFP(RuntimeVF, Step->getType());
2294   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2295 
2296   // Create a vector splat to use in the induction update.
2297   //
2298   // FIXME: If the step is non-constant, we create the vector splat with
2299   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2300   //        handle a constant vector splat.
2301   Value *SplatVF = isa<Constant>(Mul)
2302                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
2303                        : Builder.CreateVectorSplat(VF, Mul);
2304   Builder.restoreIP(CurrIP);
2305 
2306   // We may need to add the step a number of times, depending on the unroll
2307   // factor. The last of those goes into the PHI.
2308   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2309                                     &*LoopVectorBody->getFirstInsertionPt());
2310   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2311   Instruction *LastInduction = VecInd;
2312   for (unsigned Part = 0; Part < UF; ++Part) {
2313     State.set(Def, LastInduction, Part);
2314 
2315     if (isa<TruncInst>(EntryVal))
2316       addMetadata(LastInduction, EntryVal);
2317     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, CastDef,
2318                                           State, Part);
2319 
2320     LastInduction = cast<Instruction>(
2321         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2322     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2323   }
2324 
2325   // Move the last step to the end of the latch block. This ensures consistent
2326   // placement of all induction updates.
2327   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2328   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2329   auto *ICmp = cast<Instruction>(Br->getCondition());
2330   LastInduction->moveBefore(ICmp);
2331   LastInduction->setName("vec.ind.next");
2332 
2333   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2334   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2335 }
2336 
2337 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
2338   return Cost->isScalarAfterVectorization(I, VF) ||
2339          Cost->isProfitableToScalarize(I, VF);
2340 }
2341 
2342 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
2343   if (shouldScalarizeInstruction(IV))
2344     return true;
2345   auto isScalarInst = [&](User *U) -> bool {
2346     auto *I = cast<Instruction>(U);
2347     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
2348   };
2349   return llvm::any_of(IV->users(), isScalarInst);
2350 }
2351 
2352 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
2353     const InductionDescriptor &ID, const Instruction *EntryVal,
2354     Value *VectorLoopVal, VPValue *CastDef, VPTransformState &State,
2355     unsigned Part, unsigned Lane) {
2356   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2357          "Expected either an induction phi-node or a truncate of it!");
2358 
  // This induction variable is not the phi from the original loop but the
  // newly-created IV based on the proof that the casted Phi is equal to the
  // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
  // re-uses the same InductionDescriptor that the original IV uses, but we
  // don't have to do any recording in this case - that is done when the
  // original IV is processed.
2365   if (isa<TruncInst>(EntryVal))
2366     return;
2367 
2368   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
2369   if (Casts.empty())
2370     return;
  // Only the first Cast instruction in the Casts vector is of interest.
  // The rest of the Casts (if they exist) have no uses outside the
  // induction update chain itself.
2374   if (Lane < UINT_MAX)
2375     State.set(CastDef, VectorLoopVal, VPIteration(Part, Lane));
2376   else
2377     State.set(CastDef, VectorLoopVal, Part);
2378 }
2379 
2380 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
2381                                                 TruncInst *Trunc, VPValue *Def,
2382                                                 VPValue *CastDef,
2383                                                 VPTransformState &State) {
2384   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
2385          "Primary induction variable must have an integer type");
2386 
2387   auto II = Legal->getInductionVars().find(IV);
2388   assert(II != Legal->getInductionVars().end() && "IV is not an induction");
2389 
2390   auto ID = II->second;
2391   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2392 
2393   // The value from the original loop to which we are mapping the new induction
2394   // variable.
2395   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2396 
2397   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
2398 
  // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
2401   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2402     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2403            "Induction step should be loop invariant");
2404     if (PSE.getSE()->isSCEVable(IV->getType())) {
2405       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2406       return Exp.expandCodeFor(Step, Step->getType(),
2407                                LoopVectorPreHeader->getTerminator());
2408     }
2409     return cast<SCEVUnknown>(Step)->getValue();
2410   };
2411 
2412   // The scalar value to broadcast. This is derived from the canonical
2413   // induction variable. If a truncation type is given, truncate the canonical
2414   // induction variable and step. Otherwise, derive these values from the
2415   // induction descriptor.
2416   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2417     Value *ScalarIV = Induction;
2418     if (IV != OldInduction) {
2419       ScalarIV = IV->getType()->isIntegerTy()
2420                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
2421                      : Builder.CreateCast(Instruction::SIToFP, Induction,
2422                                           IV->getType());
2423       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
2424       ScalarIV->setName("offset.idx");
2425     }
2426     if (Trunc) {
2427       auto *TruncType = cast<IntegerType>(Trunc->getType());
2428       assert(Step->getType()->isIntegerTy() &&
2429              "Truncation requires an integer step");
2430       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2431       Step = Builder.CreateTrunc(Step, TruncType);
2432     }
2433     return ScalarIV;
2434   };
2435 
  // Create the vector values from the scalar IV, for the case where no vector
  // IV is created.
2438   auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
2439     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
2440     for (unsigned Part = 0; Part < UF; ++Part) {
2441       assert(!VF.isScalable() && "scalable vectors not yet supported.");
2442       Value *EntryPart =
2443           getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
2444                         ID.getInductionOpcode());
2445       State.set(Def, EntryPart, Part);
2446       if (Trunc)
2447         addMetadata(EntryPart, Trunc);
2448       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, CastDef,
2449                                             State, Part);
2450     }
2451   };
2452 
2453   // Fast-math-flags propagate from the original induction instruction.
2454   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2455   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2456     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2457 
2458   // Now do the actual transformations, and start with creating the step value.
2459   Value *Step = CreateStepValue(ID.getStep());
2460   if (VF.isZero() || VF.isScalar()) {
2461     Value *ScalarIV = CreateScalarIV(Step);
2462     CreateSplatIV(ScalarIV, Step);
2463     return;
2464   }
2465 
2466   // Determine if we want a scalar version of the induction variable. This is
2467   // true if the induction variable itself is not widened, or if it has at
2468   // least one user in the loop that is not widened.
2469   auto NeedsScalarIV = needsScalarInduction(EntryVal);
2470   if (!NeedsScalarIV) {
2471     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2472                                     State);
2473     return;
2474   }
2475 
2476   // Try to create a new independent vector induction variable. If we can't
2477   // create the phi node, we will splat the scalar induction variable in each
2478   // loop iteration.
2479   if (!shouldScalarizeInstruction(EntryVal)) {
2480     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, CastDef,
2481                                     State);
2482     Value *ScalarIV = CreateScalarIV(Step);
2483     // Create scalar steps that can be used by instructions we will later
2484     // scalarize. Note that the addition of the scalar steps will not increase
2485     // the number of instructions in the loop in the common case prior to
2486     // InstCombine. We will be trading one vector extract for each scalar step.
2487     buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2488     return;
2489   }
2490 
  // All IV users are scalar instructions, so only emit a scalar IV, not a
  // vectorized IV, except when we are tail-folding; in that case the splat IV
  // feeds the predicate used by the masked loads/stores.
2494   Value *ScalarIV = CreateScalarIV(Step);
2495   if (!Cost->isScalarEpilogueAllowed())
2496     CreateSplatIV(ScalarIV, Step);
2497   buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, CastDef, State);
2498 }
2499 
2500 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
2501                                           Instruction::BinaryOps BinOp) {
2502   // Create and check the types.
2503   auto *ValVTy = cast<VectorType>(Val->getType());
2504   ElementCount VLen = ValVTy->getElementCount();
2505 
2506   Type *STy = Val->getType()->getScalarType();
2507   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2508          "Induction Step must be an integer or FP");
2509   assert(Step->getType() == STy && "Step has wrong type");
2510 
2511   SmallVector<Constant *, 8> Indices;
2512 
2513   // Create a vector of consecutive numbers from zero to VF.
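  // For illustration only: with a fixed VF of 4, an integer type and
  // StartIdx == 4, InitVec becomes <0, 1, 2, 3> + <4, 4, 4, 4> = <4, 5, 6, 7>
  // and the result below is Val + <4, 5, 6, 7> * Step.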
2514   VectorType *InitVecValVTy = ValVTy;
2515   Type *InitVecValSTy = STy;
2516   if (STy->isFloatingPointTy()) {
2517     InitVecValSTy =
2518         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2519     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2520   }
2521   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2522 
2523   // Add on StartIdx
2524   Value *StartIdxSplat = Builder.CreateVectorSplat(
2525       VLen, ConstantInt::get(InitVecValSTy, StartIdx));
2526   InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2527 
2528   if (STy->isIntegerTy()) {
2529     Step = Builder.CreateVectorSplat(VLen, Step);
2530     assert(Step->getType() == Val->getType() && "Invalid step vec");
    // FIXME: The newly created binary instructions should contain nsw/nuw
    //        flags, which can be found from the original scalar operations.
2533     Step = Builder.CreateMul(InitVec, Step);
2534     return Builder.CreateAdd(Val, Step, "induction");
2535   }
2536 
2537   // Floating point induction.
2538   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2539          "Binary Opcode should be specified for FP induction");
2540   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2541   Step = Builder.CreateVectorSplat(VLen, Step);
2542   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2543   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2544 }
2545 
2546 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2547                                            Instruction *EntryVal,
2548                                            const InductionDescriptor &ID,
2549                                            VPValue *Def, VPValue *CastDef,
2550                                            VPTransformState &State) {
2551   // We shouldn't have to build scalar steps if we aren't vectorizing.
2552   assert(VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2554   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2555   assert(ScalarIVTy == Step->getType() &&
2556          "Val and Step should have the same type");
2557 
2558   // We build scalar steps for both integer and floating-point induction
2559   // variables. Here, we determine the kind of arithmetic we will perform.
2560   Instruction::BinaryOps AddOp;
2561   Instruction::BinaryOps MulOp;
2562   if (ScalarIVTy->isIntegerTy()) {
2563     AddOp = Instruction::Add;
2564     MulOp = Instruction::Mul;
2565   } else {
2566     AddOp = ID.getInductionOpcode();
2567     MulOp = Instruction::FMul;
2568   }
2569 
2570   // Determine the number of scalars we need to generate for each unroll
2571   // iteration. If EntryVal is uniform, we only need to generate the first
2572   // lane. Otherwise, we generate all VF values.
2573   bool IsUniform =
2574       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF);
2575   unsigned Lanes = IsUniform ? 1 : VF.getKnownMinValue();
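  // For illustration only: with a fixed VF of 4, UF == 2, an integer IV and
  // Step == 1, part 0 produces the scalars ScalarIV + {0, 1, 2, 3} and part
  // 1 produces ScalarIV + {4, 5, 6, 7}; if IsUniform, only lane 0 of each
  // part is generated.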
2576   // Compute the scalar steps and save the results in State.
2577   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2578                                      ScalarIVTy->getScalarSizeInBits());
2579   Type *VecIVTy = nullptr;
2580   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2581   if (!IsUniform && VF.isScalable()) {
2582     VecIVTy = VectorType::get(ScalarIVTy, VF);
2583     UnitStepVec = Builder.CreateStepVector(VectorType::get(IntStepTy, VF));
2584     SplatStep = Builder.CreateVectorSplat(VF, Step);
2585     SplatIV = Builder.CreateVectorSplat(VF, ScalarIV);
2586   }
2587 
2588   for (unsigned Part = 0; Part < UF; ++Part) {
2589     Value *StartIdx0 =
2590         createStepForVF(Builder, ConstantInt::get(IntStepTy, Part), VF);
2591 
2592     if (!IsUniform && VF.isScalable()) {
2593       auto *SplatStartIdx = Builder.CreateVectorSplat(VF, StartIdx0);
2594       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2595       if (ScalarIVTy->isFloatingPointTy())
2596         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2597       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2598       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2599       State.set(Def, Add, Part);
2600       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2601                                             Part);
2602       // It's useful to record the lane values too for the known minimum number
2603       // of elements so we do those below. This improves the code quality when
2604       // trying to extract the first element, for example.
2605     }
2606 
2607     if (ScalarIVTy->isFloatingPointTy())
2608       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2609 
2610     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2611       Value *StartIdx = Builder.CreateBinOp(
2612           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2613       // The step returned by `createStepForVF` is a runtime-evaluated value
2614       // when VF is scalable. Otherwise, it should be folded into a Constant.
2615       assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
2616              "Expected StartIdx to be folded to a constant when VF is not "
2617              "scalable");
2618       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2619       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2620       State.set(Def, Add, VPIteration(Part, Lane));
2621       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, CastDef, State,
2622                                             Part, Lane);
2623     }
2624   }
2625 }
2626 
2627 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2628                                                     const VPIteration &Instance,
2629                                                     VPTransformState &State) {
2630   Value *ScalarInst = State.get(Def, Instance);
2631   Value *VectorValue = State.get(Def, Instance.Part);
2632   VectorValue = Builder.CreateInsertElement(
2633       VectorValue, ScalarInst,
2634       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2635   State.set(Def, VectorValue, Instance.Part);
2636 }
2637 
2638 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2639   assert(Vec->getType()->isVectorTy() && "Invalid type");
2640   return Builder.CreateVectorReverse(Vec, "reverse");
2641 }
2642 
2643 // Return whether we allow using masked interleave-groups (for dealing with
2644 // strided loads/stores that reside in predicated blocks, or for dealing
2645 // with gaps).
2646 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2647   // If an override option has been passed in for interleaved accesses, use it.
2648   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2649     return EnableMaskedInterleavedMemAccesses;
2650 
2651   return TTI.enableMaskedInterleavedAccessVectorization();
2652 }
2653 
2654 // Try to vectorize the interleave group that \p Instr belongs to.
2655 //
2656 // E.g. Translate following interleaved load group (factor = 3):
2657 //   for (i = 0; i < N; i+=3) {
2658 //     R = Pic[i];             // Member of index 0
2659 //     G = Pic[i+1];           // Member of index 1
2660 //     B = Pic[i+2];           // Member of index 2
2661 //     ... // do something to R, G, B
2662 //   }
2663 // To:
2664 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2665 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2666 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2667 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2668 //
2669 // Or translate following interleaved store group (factor = 3):
2670 //   for (i = 0; i < N; i+=3) {
2671 //     ... do something to R, G, B
2672 //     Pic[i]   = R;           // Member of index 0
2673 //     Pic[i+1] = G;           // Member of index 1
2674 //     Pic[i+2] = B;           // Member of index 2
2675 //   }
2676 // To:
2677 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2678 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2679 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2680 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2681 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2682 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2683     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2684     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2685     VPValue *BlockInMask) {
2686   Instruction *Instr = Group->getInsertPos();
2687   const DataLayout &DL = Instr->getModule()->getDataLayout();
2688 
2689   // Prepare for the vector type of the interleaved load/store.
2690   Type *ScalarTy = getLoadStoreType(Instr);
2691   unsigned InterleaveFactor = Group->getFactor();
2692   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2693   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2694 
2695   // Prepare for the new pointers.
2696   SmallVector<Value *, 2> AddrParts;
2697   unsigned Index = Group->getIndex(Instr);
2698 
2699   // TODO: extend the masked interleaved-group support to reversed access.
2700   assert((!BlockInMask || !Group->isReverse()) &&
2701          "Reversed masked interleave-group not supported.");
2702 
2703   // If the group is reverse, adjust the index to refer to the last vector lane
2704   // instead of the first. We adjust the index from the first vector lane,
2705   // rather than directly getting the pointer for lane VF - 1, because the
2706   // pointer operand of the interleaved access is supposed to be uniform. For
2707   // uniform instructions, we're only required to generate a value for the
2708   // first vector lane in each unroll iteration.
2709   if (Group->isReverse())
2710     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2711 
2712   for (unsigned Part = 0; Part < UF; Part++) {
2713     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2714     setDebugLocFromInst(Builder, AddrPart);
2715 
    // Note that the current instruction could be at any index in the group.
    // We need to adjust the address to the member of index 0.
2718     //
2719     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2720     //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2722     //
2723     // E.g.  A[i+1] = a;     // Member of index 1
2724     //       A[i]   = b;     // Member of index 0
2725     //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2727 
2728     bool InBounds = false;
2729     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2730       InBounds = gep->isInBounds();
2731     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2732     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2733 
2734     // Cast to the vector pointer type.
2735     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2736     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2737     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2738   }
2739 
2740   setDebugLocFromInst(Builder, Instr);
2741   Value *PoisonVec = PoisonValue::get(VecTy);
2742 
2743   Value *MaskForGaps = nullptr;
2744   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2745     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2746     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2747   }
2748 
2749   // Vectorize the interleaved load group.
2750   if (isa<LoadInst>(Instr)) {
2751     // For each unroll part, create a wide load for the group.
2752     SmallVector<Value *, 2> NewLoads;
2753     for (unsigned Part = 0; Part < UF; Part++) {
2754       Instruction *NewLoad;
2755       if (BlockInMask || MaskForGaps) {
2756         assert(useMaskedInterleavedAccesses(*TTI) &&
2757                "masked interleaved groups are not allowed.");
2758         Value *GroupMask = MaskForGaps;
2759         if (BlockInMask) {
2760           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2761           Value *ShuffledMask = Builder.CreateShuffleVector(
2762               BlockInMaskPart,
2763               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2764               "interleaved.mask");
2765           GroupMask = MaskForGaps
2766                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2767                                                 MaskForGaps)
2768                           : ShuffledMask;
2769         }
2770         NewLoad =
2771             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
2772                                      GroupMask, PoisonVec, "wide.masked.vec");
      } else
2775         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2776                                             Group->getAlign(), "wide.vec");
2777       Group->addMetadata(NewLoad);
2778       NewLoads.push_back(NewLoad);
2779     }
2780 
2781     // For each member in the group, shuffle out the appropriate data from the
2782     // wide loads.
2783     unsigned J = 0;
2784     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2785       Instruction *Member = Group->getMember(I);
2786 
2787       // Skip the gaps in the group.
2788       if (!Member)
2789         continue;
2790 
2791       auto StrideMask =
2792           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2793       for (unsigned Part = 0; Part < UF; Part++) {
2794         Value *StridedVec = Builder.CreateShuffleVector(
2795             NewLoads[Part], StrideMask, "strided.vec");
2796 
        // If this member has a different type, cast the result to its type.
2798         if (Member->getType() != ScalarTy) {
2799           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2800           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2801           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2802         }
2803 
2804         if (Group->isReverse())
2805           StridedVec = reverseVector(StridedVec);
2806 
2807         State.set(VPDefs[J], StridedVec, Part);
2808       }
2809       ++J;
2810     }
2811     return;
2812   }
2813 
  // The sub vector type for the current instruction.
2815   auto *SubVT = VectorType::get(ScalarTy, VF);
2816 
2817   // Vectorize the interleaved store group.
2818   for (unsigned Part = 0; Part < UF; Part++) {
2819     // Collect the stored vector from each member.
2820     SmallVector<Value *, 4> StoredVecs;
2821     for (unsigned i = 0; i < InterleaveFactor; i++) {
      // Interleaved store groups don't allow gaps; each index has a member.
      assert(Group->getMember(i) &&
             "Fail to get a member from an interleaved store group");
2824 
2825       Value *StoredVec = State.get(StoredValues[i], Part);
2826 
2827       if (Group->isReverse())
2828         StoredVec = reverseVector(StoredVec);
2829 
      // If this member has a different type, cast it to a unified type.
      if (StoredVec->getType() != SubVT)
2833         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2834 
2835       StoredVecs.push_back(StoredVec);
2836     }
2837 
2838     // Concatenate all vectors into a wide vector.
2839     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2840 
2841     // Interleave the elements in the wide vector.
2842     Value *IVec = Builder.CreateShuffleVector(
2843         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2844         "interleaved.vec");
2845 
2846     Instruction *NewStoreInstr;
2847     if (BlockInMask) {
2848       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2849       Value *ShuffledMask = Builder.CreateShuffleVector(
2850           BlockInMaskPart,
2851           createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2852           "interleaved.mask");
2853       NewStoreInstr = Builder.CreateMaskedStore(
2854           IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
    } else
2857       NewStoreInstr =
2858           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2859 
2860     Group->addMetadata(NewStoreInstr);
2861   }
2862 }
2863 
2864 void InnerLoopVectorizer::vectorizeMemoryInstruction(
2865     Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
2866     VPValue *StoredValue, VPValue *BlockInMask) {
  // Attempt to issue a wide load or store.
2868   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2869   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2870 
2871   assert((LI || SI) && "Invalid Load/Store instruction");
2872   assert((!SI || StoredValue) && "No stored value provided for widened store");
2873   assert((!LI || !StoredValue) && "Stored value provided for widened load");
2874 
2875   LoopVectorizationCostModel::InstWidening Decision =
2876       Cost->getWideningDecision(Instr, VF);
2877   assert((Decision == LoopVectorizationCostModel::CM_Widen ||
2878           Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
2879           Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
2880          "CM decision is not to widen the memory instruction");
2881 
2882   Type *ScalarDataTy = getLoadStoreType(Instr);
2883 
2884   auto *DataTy = VectorType::get(ScalarDataTy, VF);
2885   const Align Alignment = getLoadStoreAlignment(Instr);
2886 
2887   // Determine if the pointer operand of the access is either consecutive or
2888   // reverse consecutive.
2889   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2890   bool ConsecutiveStride =
2891       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2892   bool CreateGatherScatter =
2893       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2894 
2895   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2896   // gather/scatter. Otherwise Decision should have been to Scalarize.
2897   assert((ConsecutiveStride || CreateGatherScatter) &&
2898          "The instruction should be scalarized");
2899   (void)ConsecutiveStride;
2900 
2901   VectorParts BlockInMaskParts(UF);
2902   bool isMaskRequired = BlockInMask;
2903   if (isMaskRequired)
2904     for (unsigned Part = 0; Part < UF; ++Part)
2905       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2906 
2907   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2908     // Calculate the pointer for the specific unroll-part.
2909     GetElementPtrInst *PartPtr = nullptr;
2910 
2911     bool InBounds = false;
2912     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2913       InBounds = gep->isInBounds();
2914     if (Reverse) {
2915       // If the address is consecutive but reversed, then the
2916       // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width vectors, VScale is 1, so RunTimeVF equals
      // VF.getKnownMinValue().
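      // For illustration only: with a fixed VF of 4 and Part == 1, RunTimeVF
      // is 4, NumElt is -4 and LastLane is -3, so the wide access covers
      // Ptr[-7] .. Ptr[-4]; the loaded or stored value itself is reversed
      // separately.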
2919       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), VF);
2920       // NumElt = -Part * RunTimeVF
2921       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
2922       // LastLane = 1 - RunTimeVF
2923       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
2924       PartPtr =
2925           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
2926       PartPtr->setIsInBounds(InBounds);
2927       PartPtr = cast<GetElementPtrInst>(
2928           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
2929       PartPtr->setIsInBounds(InBounds);
2930       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2931         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2932     } else {
2933       Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
2934       PartPtr = cast<GetElementPtrInst>(
2935           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
2936       PartPtr->setIsInBounds(InBounds);
2937     }
2938 
2939     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2940     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2941   };
2942 
2943   // Handle Stores:
2944   if (SI) {
2945     setDebugLocFromInst(Builder, SI);
2946 
2947     for (unsigned Part = 0; Part < UF; ++Part) {
2948       Instruction *NewSI = nullptr;
2949       Value *StoredVal = State.get(StoredValue, Part);
2950       if (CreateGatherScatter) {
2951         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2952         Value *VectorGep = State.get(Addr, Part);
2953         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
2954                                             MaskPart);
2955       } else {
2956         if (Reverse) {
2957           // If we store to reverse consecutive memory locations, then we need
2958           // to reverse the order of elements in the stored value.
2959           StoredVal = reverseVector(StoredVal);
2960           // We don't want to update the value in the map as it might be used in
2961           // another expression. So don't call resetVectorValue(StoredVal).
2962         }
2963         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2964         if (isMaskRequired)
2965           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
2966                                             BlockInMaskParts[Part]);
2967         else
2968           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
2969       }
2970       addMetadata(NewSI, SI);
2971     }
2972     return;
2973   }
2974 
2975   // Handle loads.
2976   assert(LI && "Must have a load instruction");
2977   setDebugLocFromInst(Builder, LI);
2978   for (unsigned Part = 0; Part < UF; ++Part) {
2979     Value *NewLI;
2980     if (CreateGatherScatter) {
2981       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2982       Value *VectorGep = State.get(Addr, Part);
2983       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
2984                                          nullptr, "wide.masked.gather");
2985       addMetadata(NewLI, LI);
2986     } else {
2987       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0)));
2988       if (isMaskRequired)
2989         NewLI = Builder.CreateMaskedLoad(
2990             VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
2991             "wide.masked.load");
2992       else
2993         NewLI =
2994             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
2995 
      // Add metadata to the load, but set the vector value to the reverse
      // shuffle.
2997       addMetadata(NewLI, LI);
2998       if (Reverse)
2999         NewLI = reverseVector(NewLI);
3000     }
3001 
3002     State.set(Def, NewLI, Part);
3003   }
3004 }
3005 
3006 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
3007                                                VPUser &User,
3008                                                const VPIteration &Instance,
3009                                                bool IfPredicateInstr,
3010                                                VPTransformState &State) {
3011   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
3012 
3013   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
3014   // the first lane and part.
3015   if (isa<NoAliasScopeDeclInst>(Instr))
3016     if (!Instance.isFirstIteration())
3017       return;
3018 
3019   setDebugLocFromInst(Builder, Instr);
3020 
  // Does this instruction return a value?
3022   bool IsVoidRetTy = Instr->getType()->isVoidTy();
3023 
3024   Instruction *Cloned = Instr->clone();
3025   if (!IsVoidRetTy)
3026     Cloned->setName(Instr->getName() + ".cloned");
3027 
3028   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
3029                                Builder.GetInsertPoint());
3030   // Replace the operands of the cloned instructions with their scalar
3031   // equivalents in the new loop.
3032   for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
3033     auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
3034     auto InputInstance = Instance;
3035     if (!Operand || !OrigLoop->contains(Operand) ||
3036         (Cost->isUniformAfterVectorization(Operand, State.VF)))
3037       InputInstance.Lane = VPLane::getFirstLane();
3038     auto *NewOp = State.get(User.getOperand(op), InputInstance);
3039     Cloned->setOperand(op, NewOp);
3040   }
3041   addNewMetadata(Cloned, Instr);
3042 
3043   // Place the cloned scalar in the new loop.
3044   Builder.Insert(Cloned);
3045 
3046   State.set(Def, Cloned, Instance);
3047 
  // If we just cloned a new assumption, add it to the assumption cache.
3049   if (auto *II = dyn_cast<AssumeInst>(Cloned))
3050     AC->registerAssumption(II);
3051 
3052   // End if-block.
3053   if (IfPredicateInstr)
3054     PredicatedInstructions.push_back(Cloned);
3055 }
3056 
3057 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
3058                                                       Value *End, Value *Step,
3059                                                       Instruction *DL) {
3060   BasicBlock *Header = L->getHeader();
3061   BasicBlock *Latch = L->getLoopLatch();
3062   // As we're just creating this loop, it's possible no latch exists
3063   // yet. If so, use the header as this will be a single block loop.
3064   if (!Latch)
3065     Latch = Header;
3066 
3067   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
3068   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
3069   setDebugLocFromInst(Builder, OldInst);
3070   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
3071 
3072   Builder.SetInsertPoint(Latch->getTerminator());
3073   setDebugLocFromInst(Builder, OldInst);
3074 
3075   // Create i+1 and fill the PHINode.
3076   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
3077   Induction->addIncoming(Start, L->getLoopPreheader());
3078   Induction->addIncoming(Next, Latch);
3079   // Create the compare.
3080   Value *ICmp = Builder.CreateICmpEQ(Next, End);
3081   Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
3082 
3083   // Now we have two terminators. Remove the old one from the block.
3084   Latch->getTerminator()->eraseFromParent();
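  // For illustration only (names are illustrative), the resulting IR is
  // roughly:
  //   header: %index = phi [ %start, %preheader ], [ %index.next, %latch ]
  //           ...
  //   latch:  %index.next = add %index, %step
  //           %cond = icmp eq %index.next, %end
  //           br %cond, label %exit, label %header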
3085 
3086   return Induction;
3087 }
3088 
3089 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
3090   if (TripCount)
3091     return TripCount;
3092 
3093   assert(L && "Create Trip Count for null loop.");
3094   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3095   // Find the loop boundaries.
3096   ScalarEvolution *SE = PSE.getSE();
3097   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
3098   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
3099          "Invalid loop count");
3100 
3101   Type *IdxTy = Legal->getWidestInductionType();
3102   assert(IdxTy && "No type for induction");
3103 
  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way that we get a backedge-taken count is if the
  // induction variable was signed and as such will not overflow. In such a
  // case truncation is legal.
3109   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
3110       IdxTy->getPrimitiveSizeInBits())
3111     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3112   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3113 
3114   // Get the total trip count from the count by adding 1.
3115   const SCEV *ExitCount = SE->getAddExpr(
3116       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
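  // For illustration only: for a loop running i = 0 .. n-1 (n > 0), the
  // backedge-taken count is n - 1 and the trip count computed here is n.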
3117 
3118   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3119 
3120   // Expand the trip count and place the new instructions in the preheader.
3121   // Notice that the pre-header does not change, only the loop body.
3122   SCEVExpander Exp(*SE, DL, "induction");
3123 
3124   // Count holds the overall loop count (N).
3125   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3126                                 L->getLoopPreheader()->getTerminator());
3127 
3128   if (TripCount->getType()->isPointerTy())
3129     TripCount =
3130         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3131                                     L->getLoopPreheader()->getTerminator());
3132 
3133   return TripCount;
3134 }
3135 
3136 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3137   if (VectorTripCount)
3138     return VectorTripCount;
3139 
3140   Value *TC = getOrCreateTripCount(L);
3141   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3142 
3143   Type *Ty = TC->getType();
3144   // This is where we can make the step a runtime constant.
3145   Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
3146 
3147   // If the tail is to be folded by masking, round the number of iterations N
3148   // up to a multiple of Step instead of rounding down. This is done by first
3149   // adding Step-1 and then rounding down. Note that it's ok if this addition
3150   // overflows: the vector induction variable will eventually wrap to zero given
3151   // that it starts at zero and its Step is a power of two; the loop will then
3152   // exit, with the last early-exit vector comparison also producing all-true.
3153   if (Cost->foldTailByMasking()) {
3154     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3155            "VF*UF must be a power of 2 when folding tail by masking");
3156     assert(!VF.isScalable() &&
3157            "Tail folding not yet supported for scalable vectors");
3158     TC = Builder.CreateAdd(
3159         TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
3160   }
3161 
3162   // Now we need to generate the expression for the part of the loop that the
3163   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3164   // iterations are not required for correctness, or N - Step, otherwise. Step
3165   // is equal to the vectorization factor (number of SIMD elements) times the
3166   // unroll factor (number of SIMD instructions).
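  // For illustration only (assuming no tail folding): with a trip count of
  // 103, a fixed VF of 8 and UF == 2 (Step == 16), n.mod.vf is 103 % 16 == 7
  // and n.vec is 96; the remaining 7 iterations run in the scalar loop.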
3167   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3168 
3169   // There are two cases where we need to ensure (at least) the last iteration
3170   // runs in the scalar remainder loop. Thus, if the step evenly divides
3171   // the trip count, we set the remainder to be equal to the step. If the step
3172   // does not evenly divide the trip count, no adjustment is necessary since
3173   // there will already be scalar iterations. Note that the minimum iterations
3174   // check ensures that N >= Step. The cases are:
3175   // 1) If there is a non-reversed interleaved group that may speculatively
3176   //    access memory out-of-bounds.
3177   // 2) If any instruction may follow a conditionally taken exit. That is, if
3178   //    the loop contains multiple exiting blocks, or a single exiting block
3179   //    which is not the latch.
3180   if (VF.isVector() && Cost->requiresScalarEpilogue()) {
3181     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3182     R = Builder.CreateSelect(IsZero, Step, R);
3183   }
3184 
3185   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3186 
3187   return VectorTripCount;
3188 }
3189 
3190 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3191                                                    const DataLayout &DL) {
3192   // Verify that V is a vector type with same number of elements as DstVTy.
3193   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3194   unsigned VF = DstFVTy->getNumElements();
3195   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
3197   Type *SrcElemTy = SrcVecTy->getElementType();
3198   Type *DstElemTy = DstFVTy->getElementType();
3199   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3200          "Vector elements must have same size");
3201 
3202   // Do a direct cast if element types are castable.
3203   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3204     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3205   }
  // V cannot be directly cast to the desired vector type.
  // This may happen when V is a floating point vector but DstVTy is a vector
  // of pointers or vice-versa. Handle this using a two-step cast through an
  // intermediate integer type, i.e. Ptr <-> Int <-> Float.
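  // For illustration only: on a target with 64-bit pointers, casting
  // <4 x double> to <4 x i8*> goes through <4 x i64>, i.e. a bitcast followed
  // by an inttoptr.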
3210   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3211          "Only one type should be a pointer type");
3212   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3213          "Only one type should be a floating point type");
3214   Type *IntTy =
3215       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3216   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3217   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3218   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3219 }
3220 
3221 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3222                                                          BasicBlock *Bypass) {
3223   Value *Count = getOrCreateTripCount(L);
3224   // Reuse existing vector loop preheader for TC checks.
3225   // Note that new preheader block is generated for vector loop.
3226   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3227   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3228 
3229   // Generate code to check if the loop's trip count is less than VF * UF, or
3230   // equal to it in case a scalar epilogue is required; this implies that the
3231   // vector trip count is zero. This check also covers the case where adding one
3232   // to the backedge-taken count overflowed leading to an incorrect trip count
3233   // of zero. In this case we will also jump to the scalar loop.
3234   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
3235                                           : ICmpInst::ICMP_ULT;
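  // For illustration only: with a fixed VF of 4, UF == 2 and no tail
  // folding, the check below is roughly "icmp ult %count, 8" (ule if a
  // scalar epilogue is required); if it holds, we branch to the scalar loop.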
3236 
3237   // If tail is to be folded, vector loop takes care of all iterations.
3238   Value *CheckMinIters = Builder.getFalse();
3239   if (!Cost->foldTailByMasking()) {
3240     Value *Step =
3241         createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
3242     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3243   }
3244   // Create new preheader for vector loop.
3245   LoopVectorPreHeader =
3246       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3247                  "vector.ph");
3248 
3249   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3250                                DT->getNode(Bypass)->getIDom()) &&
3251          "TC check is expected to dominate Bypass");
3252 
3253   // Update dominator for Bypass & LoopExit.
3254   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3255   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3256 
3257   ReplaceInstWithInst(
3258       TCCheckBlock->getTerminator(),
3259       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3260   LoopBypassBlocks.push_back(TCCheckBlock);
3261 }
3262 
3263 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3265   BasicBlock *const SCEVCheckBlock =
3266       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3267   if (!SCEVCheckBlock)
3268     return nullptr;
3269 
3270   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3271            (OptForSizeBasedOnProfile &&
3272             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3273          "Cannot SCEV check stride or overflow when optimizing for size");
3274 
  // Update dominator only if this is the first RT check.
3277   if (LoopBypassBlocks.empty()) {
3278     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3279     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3280   }
3281 
3282   LoopBypassBlocks.push_back(SCEVCheckBlock);
3283   AddedSafetyChecks = true;
3284   return SCEVCheckBlock;
3285 }
3286 
3287 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3288                                                       BasicBlock *Bypass) {
3289   // VPlan-native path does not do any analysis for runtime checks currently.
3290   if (EnableVPlanNativePath)
3291     return nullptr;
3292 
3293   BasicBlock *const MemCheckBlock =
3294       RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3295 
  // Check if we generated code that checks at runtime whether arrays overlap.
  // We put the checks into a separate block to make the more common case of
  // few elements faster.
3299   if (!MemCheckBlock)
3300     return nullptr;
3301 
3302   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3303     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3304            "Cannot emit memory checks when optimizing for size, unless forced "
3305            "to vectorize.");
3306     ORE->emit([&]() {
3307       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3308                                         L->getStartLoc(), L->getHeader())
3309              << "Code-size may be reduced by not forcing "
3310                 "vectorization, or by source-code modifications "
3311                 "eliminating the need for runtime checks "
3312                 "(e.g., adding 'restrict').";
3313     });
3314   }
3315 
3316   LoopBypassBlocks.push_back(MemCheckBlock);
3317 
3318   AddedSafetyChecks = true;
3319 
3320   // We currently don't use LoopVersioning for the actual loop cloning but we
3321   // still use it to add the noalias metadata.
3322   LVer = std::make_unique<LoopVersioning>(
3323       *Legal->getLAI(),
3324       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3325       DT, PSE.getSE());
3326   LVer->prepareNoAliasMetadata();
3327   return MemCheckBlock;
3328 }
3329 
3330 Value *InnerLoopVectorizer::emitTransformedIndex(
3331     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3332     const InductionDescriptor &ID) const {
3333 
3334   SCEVExpander Exp(*SE, DL, "induction");
3335   auto Step = ID.getStep();
3336   auto StartValue = ID.getStartValue();
3337   assert(Index->getType()->getScalarType() == Step->getType() &&
3338          "Index scalar type does not match StepValue type");
3339 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle some
  // trivial cases only.
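  // For illustration only: for an integer induction with start value S and
  // step C, the transformed index is S + Index * C; the CreateAdd/CreateMul
  // helpers below merely fold away additions of 0 and multiplications by 1.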
3346   auto CreateAdd = [&B](Value *X, Value *Y) {
3347     assert(X->getType() == Y->getType() && "Types don't match!");
3348     if (auto *CX = dyn_cast<ConstantInt>(X))
3349       if (CX->isZero())
3350         return Y;
3351     if (auto *CY = dyn_cast<ConstantInt>(Y))
3352       if (CY->isZero())
3353         return X;
3354     return B.CreateAdd(X, Y);
3355   };
3356 
3357   // We allow X to be a vector type, in which case Y will potentially be
3358   // splatted into a vector with the same element count.
3359   auto CreateMul = [&B](Value *X, Value *Y) {
3360     assert(X->getType()->getScalarType() == Y->getType() &&
3361            "Types don't match!");
3362     if (auto *CX = dyn_cast<ConstantInt>(X))
3363       if (CX->isOne())
3364         return Y;
3365     if (auto *CY = dyn_cast<ConstantInt>(Y))
3366       if (CY->isOne())
3367         return X;
3368     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3369     if (XVTy && !isa<VectorType>(Y->getType()))
3370       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3371     return B.CreateMul(X, Y);
3372   };
3373 
3374   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3375   // loop, choose the end of the vector loop header (=LoopVectorBody), because
3376   // the DomTree is not kept up-to-date for additional blocks generated in the
3377   // vector loop. By using the header as insertion point, we guarantee that the
3378   // expanded instructions dominate all their uses.
3379   auto GetInsertPoint = [this, &B]() {
3380     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3381     if (InsertBB != LoopVectorBody &&
3382         LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
3383       return LoopVectorBody->getTerminator();
3384     return &*B.GetInsertPoint();
3385   };
3386 
3387   switch (ID.getKind()) {
3388   case InductionDescriptor::IK_IntInduction: {
3389     assert(!isa<VectorType>(Index->getType()) &&
3390            "Vector indices not supported for integer inductions yet");
3391     assert(Index->getType() == StartValue->getType() &&
3392            "Index type does not match StartValue type");
3393     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3394       return B.CreateSub(StartValue, Index);
3395     auto *Offset = CreateMul(
3396         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3397     return CreateAdd(StartValue, Offset);
3398   }
3399   case InductionDescriptor::IK_PtrInduction: {
3400     assert(isa<SCEVConstant>(Step) &&
3401            "Expected constant step for pointer induction");
3402     return B.CreateGEP(
3403         StartValue->getType()->getPointerElementType(), StartValue,
3404         CreateMul(Index,
3405                   Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3406                                     GetInsertPoint())));
3407   }
3408   case InductionDescriptor::IK_FpInduction: {
3409     assert(!isa<VectorType>(Index->getType()) &&
3410            "Vector indices not supported for FP inductions yet");
3411     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3412     auto InductionBinOp = ID.getInductionBinOp();
3413     assert(InductionBinOp &&
3414            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3415             InductionBinOp->getOpcode() == Instruction::FSub) &&
3416            "Original bin op should be defined for FP induction");
3417 
3418     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3419     Value *MulExp = B.CreateFMul(StepValue, Index);
3420     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3421                          "induction");
3422   }
3423   case InductionDescriptor::IK_NoInduction:
3424     return nullptr;
3425   }
3426   llvm_unreachable("invalid enum");
3427 }
3428 
3429 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3430   LoopScalarBody = OrigLoop->getHeader();
3431   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3432   LoopExitBlock = OrigLoop->getUniqueExitBlock();
3433   assert(LoopExitBlock && "Must have an exit block");
3434   assert(LoopVectorPreHeader && "Invalid loop structure");
3435 
3436   LoopMiddleBlock =
3437       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3438                  LI, nullptr, Twine(Prefix) + "middle.block");
3439   LoopScalarPreHeader =
3440       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3441                  nullptr, Twine(Prefix) + "scalar.ph");
3442 
3443   // Set up branch from middle block to the exit and scalar preheader blocks.
3444   // completeLoopSkeleton will update the condition to use an iteration check,
3445   // if required to decide whether to execute the remainder.
3446   BranchInst *BrInst =
3447       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
3448   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3449   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3450   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3451 
  // We intentionally don't let SplitBlock update LoopInfo since LoopVectorBody
  // should belong to a different loop than LoopVectorPreHeader. LoopVectorBody
  // is explicitly added to the correct place a few lines later.
3455   LoopVectorBody =
3456       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3457                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3458 
3459   // Update dominator for loop exit.
3460   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3461 
3462   // Create and register the new vector loop.
3463   Loop *Lp = LI->AllocateLoop();
3464   Loop *ParentLoop = OrigLoop->getParentLoop();
3465 
3466   // Insert the new loop into the loop nest and register the new basic blocks
3467   // before calling any utilities such as SCEV that require valid LoopInfo.
3468   if (ParentLoop) {
3469     ParentLoop->addChildLoop(Lp);
3470   } else {
3471     LI->addTopLevelLoop(Lp);
3472   }
3473   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3474   return Lp;
3475 }
3476 
3477 void InnerLoopVectorizer::createInductionResumeValues(
3478     Loop *L, Value *VectorTripCount,
3479     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3480   assert(VectorTripCount && L && "Expected valid arguments");
3481   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3482           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3483          "Inconsistent information about additional bypass.");
3484   // We are going to resume the execution of the scalar loop.
3485   // Go over all of the induction variables that we found and fix the
3486   // PHIs that are left in the scalar version of the loop.
3487   // The starting values of PHI nodes depend on the counter of the last
3488   // iteration in the vectorized loop.
3489   // If we come from a bypass edge then we need to start from the original
3490   // start value.
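  // For illustration only: for a secondary induction j = 7 + 3 * i, the
  // resume value entering the scalar loop is 7 + 3 * VectorTripCount when
  // arriving from the middle block, and 7 when arriving from a bypass block.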
3491   for (auto &InductionEntry : Legal->getInductionVars()) {
3492     PHINode *OrigPhi = InductionEntry.first;
3493     InductionDescriptor II = InductionEntry.second;
3494 
    // Create phi nodes to merge from the backedge-taken check block.
3496     PHINode *BCResumeVal =
3497         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3498                         LoopScalarPreHeader->getTerminator());
3499     // Copy original phi DL over to the new one.
3500     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3501     Value *&EndValue = IVEndValues[OrigPhi];
3502     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3503     if (OrigPhi == OldInduction) {
3504       // We know what the end value is.
3505       EndValue = VectorTripCount;
3506     } else {
3507       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3508 
3509       // Fast-math-flags propagate from the original induction instruction.
3510       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3511         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3512 
3513       Type *StepType = II.getStep()->getType();
3514       Instruction::CastOps CastOp =
3515           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3516       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3517       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3518       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3519       EndValue->setName("ind.end");
3520 
3521       // Compute the end value for the additional bypass (if applicable).
3522       if (AdditionalBypass.first) {
3523         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3524         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3525                                          StepType, true);
3526         CRD =
3527             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3528         EndValueFromAdditionalBypass =
3529             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3530         EndValueFromAdditionalBypass->setName("ind.end");
3531       }
3532     }
3533     // The new PHI merges the original incoming value, in case of a bypass,
3534     // or the value at the end of the vectorized loop.
3535     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3536 
3537     // Fix the scalar body counter (PHI node).
3538     // The old induction's phi node in the scalar body needs the truncated
3539     // value.
3540     for (BasicBlock *BB : LoopBypassBlocks)
3541       BCResumeVal->addIncoming(II.getStartValue(), BB);
3542 
3543     if (AdditionalBypass.first)
3544       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3545                                             EndValueFromAdditionalBypass);
3546 
3547     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3548   }
3549 }
3550 
3551 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3552                                                       MDNode *OrigLoopID) {
3553   assert(L && "Expected valid loop.");
3554 
3555   // The trip counts should be cached by now.
3556   Value *Count = getOrCreateTripCount(L);
3557   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3558 
3559   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3560 
3561   // Add a check in the middle block to see if we have completed
3562   // all of the iterations in the first vector loop.
3563   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3564   // If tail is to be folded, we know we don't need to run the remainder.
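  // When the tail is not folded, in shorthand (illustrative only) the middle
  // block then ends with
  //   %cmp.n = icmp eq i64 %trip.count, %vector.trip.count
  //   br i1 %cmp.n, label %exit.block, label %scalar.preheader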
3565   if (!Cost->foldTailByMasking()) {
3566     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3567                                         Count, VectorTripCount, "cmp.n",
3568                                         LoopMiddleBlock->getTerminator());
3569 
3570     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3571     // of the corresponding compare because they may have ended up with
3572     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3574     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3575     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3576   }
3577 
3578   // Get ready to start creating new instructions into the vectorized body.
3579   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3580          "Inconsistent vector loop preheader");
3581   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3582 
3583   Optional<MDNode *> VectorizedLoopID =
3584       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3585                                       LLVMLoopVectorizeFollowupVectorized});
3586   if (VectorizedLoopID.hasValue()) {
3587     L->setLoopID(VectorizedLoopID.getValue());
3588 
3589     // Do not setAlreadyVectorized if loop attributes have been defined
3590     // explicitly.
3591     return LoopVectorPreHeader;
3592   }
3593 
3594   // Keep all loop hints from the original loop on the vector loop (we'll
3595   // replace the vectorizer-specific hints below).
3596   if (MDNode *LID = OrigLoop->getLoopID())
3597     L->setLoopID(LID);
3598 
3599   LoopVectorizeHints Hints(L, true, *ORE);
3600   Hints.setAlreadyVectorized();
3601 
3602 #ifdef EXPENSIVE_CHECKS
3603   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3604   LI->verify(*DT);
3605 #endif
3606 
3607   return LoopVectorPreHeader;
3608 }
3609 
3610 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3611   /*
3612    In this function we generate a new loop. The new loop will contain
3613    the vectorized instructions while the old loop will continue to run the
3614    scalar remainder.
3615 
3616        [ ] <-- loop iteration number check.
3617     /   |
3618    /    v
3619   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3620   |  /  |
3621   | /   v
3622   ||   [ ]     <-- vector pre header.
3623   |/    |
3624   |     v
3625   |    [  ] \
3626   |    [  ]_|   <-- vector loop.
3627   |     |
3628   |     v
3629   |   -[ ]   <--- middle-block.
3630   |  /  |
3631   | /   v
3632   -|- >[ ]     <--- new preheader.
3633    |    |
3634    |    v
3635    |   [ ] \
3636    |   [ ]_|   <-- old scalar loop to handle remainder.
3637     \   |
3638      \  v
3639       >[ ]     <-- exit block.
3640    ...
3641    */
3642 
3643   // Get the metadata of the original loop before it gets modified.
3644   MDNode *OrigLoopID = OrigLoop->getLoopID();
3645 
3646   // Workaround!  Compute the trip count of the original loop and cache it
3647   // before we start modifying the CFG.  This code has a systemic problem
3648   // wherein it tries to run analysis over partially constructed IR; this is
3649   // wrong, and not simply for SCEV.  The trip count of the original loop
3650   // simply happens to be prone to hitting this in practice.  In theory, we
3651   // can hit the same issue for any SCEV, or ValueTracking query done during
3652   // mutation.  See PR49900.
3653   getOrCreateTripCount(OrigLoop);
3654 
3655   // Create an empty vector loop, and prepare basic blocks for the runtime
3656   // checks.
3657   Loop *Lp = createVectorLoopSkeleton("");
3658 
3659   // Now, compare the new count to zero. If it is zero skip the vector loop and
3660   // jump to the scalar loop. This check also covers the case where the
3661   // backedge-taken count is uint##_max: adding one to it will overflow leading
3662   // to an incorrect trip count of zero. In this (rare) case we will also jump
3663   // to the scalar loop.
3664   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3665 
3666   // Generate the code to check any assumptions that we've made for SCEV
3667   // expressions.
3668   emitSCEVChecks(Lp, LoopScalarPreHeader);
3669 
3670   // Generate the code that checks in runtime if arrays overlap. We put the
3671   // checks into a separate block to make the more common case of few elements
3672   // faster.
3673   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3674 
3675   // Some loops have a single integer induction variable, while other loops
  // don't. One example is C++ iterators that often have multiple pointer
3677   // induction variables. In the code below we also support a case where we
3678   // don't have a single induction variable.
3679   //
3680   // We try to obtain an induction variable from the original loop as hard
  // as possible. However, if we don't find one that:
3682   //   - is an integer
3683   //   - counts from zero, stepping by one
3684   //   - is the size of the widest induction variable type
3685   // then we create a new one.
3686   OldInduction = Legal->getPrimaryInduction();
3687   Type *IdxTy = Legal->getWidestInductionType();
3688   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3689   // The loop step is equal to the vectorization factor (num of SIMD elements)
3690   // times the unroll factor (num of SIMD instructions).
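  // In shorthand (illustrative only), the canonical induction looks like
  //   %index      = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  //   %index.next = add i64 %index, <VF * UF>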
3691   Builder.SetInsertPoint(&*Lp->getHeader()->getFirstInsertionPt());
3692   Value *Step = createStepForVF(Builder, ConstantInt::get(IdxTy, UF), VF);
3693   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3694   Induction =
3695       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3696                               getDebugLocFromInstOrOperands(OldInduction));
3697 
3698   // Emit phis for the new starting index of the scalar loop.
3699   createInductionResumeValues(Lp, CountRoundDown);
3700 
3701   return completeLoopSkeleton(Lp, OrigLoopID);
3702 }
3703 
3704 // Fix up external users of the induction variable. At this point, we are
3705 // in LCSSA form, with all external PHIs that use the IV having one input value,
3706 // coming from the remainder loop. We need those PHIs to also have a correct
3707 // value for the IV when arriving directly from the middle block.
3708 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3709                                        const InductionDescriptor &II,
3710                                        Value *CountRoundDown, Value *EndValue,
3711                                        BasicBlock *MiddleBlock) {
3712   // There are two kinds of external IV usages - those that use the value
3713   // computed in the last iteration (the PHI) and those that use the penultimate
3714   // value (the value that feeds into the phi from the loop latch).
  // We allow both, but they obviously have different values.
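  // In shorthand (illustrative only), given
  //   %iv      = phi i64 [ 0, %ph ], [ %iv.next, %latch ]
  //   %iv.next = add i64 %iv, 1
  // an LCSSA phi fed by %iv.next wants the last value, while one fed by %iv
  // wants the penultimate value.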
3716 
3717   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3718 
3719   DenseMap<Value *, Value *> MissingVals;
3720 
3721   // An external user of the last iteration's value should see the value that
3722   // the remainder loop uses to initialize its own IV.
3723   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3724   for (User *U : PostInc->users()) {
3725     Instruction *UI = cast<Instruction>(U);
3726     if (!OrigLoop->contains(UI)) {
3727       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3728       MissingVals[UI] = EndValue;
3729     }
3730   }
3731 
  // An external user of the penultimate value needs to see EndValue - Step.
3733   // The simplest way to get this is to recompute it from the constituent SCEVs,
3734   // that is Start + (Step * (CRD - 1)).
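  // For example (illustrative only), with Start = 0, Step = 1 and CRD = 8,
  // the penultimate value is 0 + 1 * (8 - 1) = 7, i.e. one step before the
  // end value 8.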
3735   for (User *U : OrigPhi->users()) {
3736     auto *UI = cast<Instruction>(U);
3737     if (!OrigLoop->contains(UI)) {
3738       const DataLayout &DL =
3739           OrigLoop->getHeader()->getModule()->getDataLayout();
3740       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3741 
3742       IRBuilder<> B(MiddleBlock->getTerminator());
3743 
3744       // Fast-math-flags propagate from the original induction instruction.
3745       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3746         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3747 
3748       Value *CountMinusOne = B.CreateSub(
3749           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3750       Value *CMO =
3751           !II.getStep()->getType()->isIntegerTy()
3752               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3753                              II.getStep()->getType())
3754               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3755       CMO->setName("cast.cmo");
3756       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3757       Escape->setName("ind.escape");
3758       MissingVals[UI] = Escape;
3759     }
3760   }
3761 
3762   for (auto &I : MissingVals) {
3763     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3765     // that is %IV2 = phi [...], [ %IV1, %latch ]
3766     // In this case, if IV1 has an external use, we need to avoid adding both
3767     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3768     // don't already have an incoming value for the middle block.
3769     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3770       PHI->addIncoming(I.second, MiddleBlock);
3771   }
3772 }
3773 
3774 namespace {
3775 
3776 struct CSEDenseMapInfo {
3777   static bool canHandle(const Instruction *I) {
3778     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3779            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3780   }
3781 
3782   static inline Instruction *getEmptyKey() {
3783     return DenseMapInfo<Instruction *>::getEmptyKey();
3784   }
3785 
3786   static inline Instruction *getTombstoneKey() {
3787     return DenseMapInfo<Instruction *>::getTombstoneKey();
3788   }
3789 
3790   static unsigned getHashValue(const Instruction *I) {
3791     assert(canHandle(I) && "Unknown instruction!");
3792     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3793                                                            I->value_op_end()));
3794   }
3795 
3796   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3797     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3798         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3799       return LHS == RHS;
3800     return LHS->isIdenticalTo(RHS);
3801   }
3802 };
3803 
3804 } // end anonymous namespace
3805 
/// Perform CSE of induction variable instructions.
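/// For example (illustrative only), two identical extracts
///   %a = extractelement <4 x i32> %v, i32 0
///   %b = extractelement <4 x i32> %v, i32 0
/// are merged: all uses of %b are rewritten to use %a and %b is erased.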
3807 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3809   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3810   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3811     Instruction *In = &*I++;
3812 
3813     if (!CSEDenseMapInfo::canHandle(In))
3814       continue;
3815 
3816     // Check if we can replace this instruction with any of the
3817     // visited instructions.
3818     if (Instruction *V = CSEMap.lookup(In)) {
3819       In->replaceAllUsesWith(V);
3820       In->eraseFromParent();
3821       continue;
3822     }
3823 
3824     CSEMap[In] = In;
3825   }
3826 }
3827 
3828 InstructionCost
3829 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3830                                               bool &NeedToScalarize) const {
3831   Function *F = CI->getCalledFunction();
3832   Type *ScalarRetTy = CI->getType();
3833   SmallVector<Type *, 4> Tys, ScalarTys;
3834   for (auto &ArgOp : CI->arg_operands())
3835     ScalarTys.push_back(ArgOp->getType());
3836 
3837   // Estimate cost of scalarized vector call. The source operands are assumed
3838   // to be vectors, so we need to extract individual elements from there,
3839   // execute VF scalar calls, and then gather the result into the vector return
3840   // value.
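  // In other words (illustrative summary), the scalarized estimate computed
  // below is roughly ScalarCallCost * VF plus the extract/insert overhead.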
3841   InstructionCost ScalarCallCost =
3842       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3843   if (VF.isScalar())
3844     return ScalarCallCost;
3845 
3846   // Compute corresponding vector type for return value and arguments.
3847   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3848   for (Type *ScalarTy : ScalarTys)
3849     Tys.push_back(ToVectorTy(ScalarTy, VF));
3850 
3851   // Compute costs of unpacking argument values for the scalar calls and
3852   // packing the return values to a vector.
3853   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3854 
3855   InstructionCost Cost =
3856       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3857 
3858   // If we can't emit a vector call for this function, then the currently found
3859   // cost is the cost we need to return.
3860   NeedToScalarize = true;
3861   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3862   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3863 
3864   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3865     return Cost;
3866 
3867   // If the corresponding vector cost is cheaper, return its cost.
3868   InstructionCost VectorCallCost =
3869       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3870   if (VectorCallCost < Cost) {
3871     NeedToScalarize = false;
3872     Cost = VectorCallCost;
3873   }
3874   return Cost;
3875 }
3876 
3877 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3878   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3879     return Elt;
3880   return VectorType::get(Elt, VF);
3881 }
3882 
3883 InstructionCost
3884 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3885                                                    ElementCount VF) const {
3886   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3887   assert(ID && "Expected intrinsic call!");
3888   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3889   FastMathFlags FMF;
3890   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3891     FMF = FPMO->getFastMathFlags();
3892 
3893   SmallVector<const Value *> Arguments(CI->arg_begin(), CI->arg_end());
3894   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3895   SmallVector<Type *> ParamTys;
3896   std::transform(FTy->param_begin(), FTy->param_end(),
3897                  std::back_inserter(ParamTys),
3898                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3899 
3900   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3901                                     dyn_cast<IntrinsicInst>(CI));
3902   return TTI.getIntrinsicInstrCost(CostAttrs,
3903                                    TargetTransformInfo::TCK_RecipThroughput);
3904 }
3905 
3906 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3907   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3908   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3909   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3910 }
3911 
3912 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3913   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3914   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3915   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3916 }
3917 
3918 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3919   // For every instruction `I` in MinBWs, truncate the operands, create a
3920   // truncated version of `I` and reextend its result. InstCombine runs
3921   // later and will remove any ext/trunc pairs.
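  // For example (illustrative only), if MinBWs records that an i32 add only
  // needs 8 bits, a widened
  //   %a = add <4 x i32> %x, %y
  // is rewritten roughly as
  //   %x.tr = trunc <4 x i32> %x to <4 x i8>
  //   %y.tr = trunc <4 x i32> %y to <4 x i8>
  //   %a.tr = add <4 x i8> %x.tr, %y.tr
  //   %a    = zext <4 x i8> %a.tr to <4 x i32>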
3922   SmallPtrSet<Value *, 4> Erased;
3923   for (const auto &KV : Cost->getMinimalBitwidths()) {
3924     // If the value wasn't vectorized, we must maintain the original scalar
3925     // type. The absence of the value from State indicates that it
3926     // wasn't vectorized.
3927     VPValue *Def = State.Plan->getVPValue(KV.first);
3928     if (!State.hasAnyVectorValue(Def))
3929       continue;
3930     for (unsigned Part = 0; Part < UF; ++Part) {
3931       Value *I = State.get(Def, Part);
3932       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3933         continue;
3934       Type *OriginalTy = I->getType();
3935       Type *ScalarTruncatedTy =
3936           IntegerType::get(OriginalTy->getContext(), KV.second);
3937       auto *TruncatedTy = FixedVectorType::get(
3938           ScalarTruncatedTy,
3939           cast<FixedVectorType>(OriginalTy)->getNumElements());
3940       if (TruncatedTy == OriginalTy)
3941         continue;
3942 
3943       IRBuilder<> B(cast<Instruction>(I));
3944       auto ShrinkOperand = [&](Value *V) -> Value * {
3945         if (auto *ZI = dyn_cast<ZExtInst>(V))
3946           if (ZI->getSrcTy() == TruncatedTy)
3947             return ZI->getOperand(0);
3948         return B.CreateZExtOrTrunc(V, TruncatedTy);
3949       };
3950 
3951       // The actual instruction modification depends on the instruction type,
3952       // unfortunately.
3953       Value *NewI = nullptr;
3954       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3955         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3956                              ShrinkOperand(BO->getOperand(1)));
3957 
3958         // Any wrapping introduced by shrinking this operation shouldn't be
3959         // considered undefined behavior. So, we can't unconditionally copy
3960         // arithmetic wrapping flags to NewI.
3961         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3962       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3963         NewI =
3964             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3965                          ShrinkOperand(CI->getOperand(1)));
3966       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3967         NewI = B.CreateSelect(SI->getCondition(),
3968                               ShrinkOperand(SI->getTrueValue()),
3969                               ShrinkOperand(SI->getFalseValue()));
3970       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3971         switch (CI->getOpcode()) {
3972         default:
3973           llvm_unreachable("Unhandled cast!");
3974         case Instruction::Trunc:
3975           NewI = ShrinkOperand(CI->getOperand(0));
3976           break;
3977         case Instruction::SExt:
3978           NewI = B.CreateSExtOrTrunc(
3979               CI->getOperand(0),
3980               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3981           break;
3982         case Instruction::ZExt:
3983           NewI = B.CreateZExtOrTrunc(
3984               CI->getOperand(0),
3985               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3986           break;
3987         }
3988       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3989         auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
3990                              ->getNumElements();
3991         auto *O0 = B.CreateZExtOrTrunc(
3992             SI->getOperand(0),
3993             FixedVectorType::get(ScalarTruncatedTy, Elements0));
3994         auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
3995                              ->getNumElements();
3996         auto *O1 = B.CreateZExtOrTrunc(
3997             SI->getOperand(1),
3998             FixedVectorType::get(ScalarTruncatedTy, Elements1));
3999 
4000         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
4001       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
4002         // Don't do anything with the operands, just extend the result.
4003         continue;
4004       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
4005         auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
4006                             ->getNumElements();
4007         auto *O0 = B.CreateZExtOrTrunc(
4008             IE->getOperand(0),
4009             FixedVectorType::get(ScalarTruncatedTy, Elements));
4010         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
4011         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
4012       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
4013         auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
4014                             ->getNumElements();
4015         auto *O0 = B.CreateZExtOrTrunc(
4016             EE->getOperand(0),
4017             FixedVectorType::get(ScalarTruncatedTy, Elements));
4018         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
4019       } else {
4020         // If we don't know what to do, be conservative and don't do anything.
4021         continue;
4022       }
4023 
4024       // Lastly, extend the result.
4025       NewI->takeName(cast<Instruction>(I));
4026       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
4027       I->replaceAllUsesWith(Res);
4028       cast<Instruction>(I)->eraseFromParent();
4029       Erased.insert(I);
4030       State.reset(Def, Res, Part);
4031     }
4032   }
4033 
  // We'll have created a bunch of ZExts that are now unused. Clean them up.
4035   for (const auto &KV : Cost->getMinimalBitwidths()) {
4036     // If the value wasn't vectorized, we must maintain the original scalar
4037     // type. The absence of the value from State indicates that it
4038     // wasn't vectorized.
4039     VPValue *Def = State.Plan->getVPValue(KV.first);
4040     if (!State.hasAnyVectorValue(Def))
4041       continue;
4042     for (unsigned Part = 0; Part < UF; ++Part) {
4043       Value *I = State.get(Def, Part);
4044       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
4045       if (Inst && Inst->use_empty()) {
4046         Value *NewI = Inst->getOperand(0);
4047         Inst->eraseFromParent();
4048         State.reset(Def, NewI, Part);
4049       }
4050     }
4051   }
4052 }
4053 
4054 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
4055   // Insert truncates and extends for any truncated instructions as hints to
4056   // InstCombine.
4057   if (VF.isVector())
4058     truncateToMinimalBitwidths(State);
4059 
4060   // Fix widened non-induction PHIs by setting up the PHI operands.
4061   if (OrigPHIsToFix.size()) {
4062     assert(EnableVPlanNativePath &&
4063            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
4064     fixNonInductionPHIs(State);
4065   }
4066 
4067   // At this point every instruction in the original loop is widened to a
4068   // vector form. Now we need to fix the recurrences in the loop. These PHI
4069   // nodes are currently empty because we did not want to introduce cycles.
4070   // This is the second stage of vectorizing recurrences.
4071   fixCrossIterationPHIs(State);
4072 
4073   // Forget the original basic block.
4074   PSE.getSE()->forgetLoop(OrigLoop);
4075 
4076   // Fix-up external users of the induction variables.
4077   for (auto &Entry : Legal->getInductionVars())
4078     fixupIVUsers(Entry.first, Entry.second,
4079                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4080                  IVEndValues[Entry.first], LoopMiddleBlock);
4081 
4082   fixLCSSAPHIs(State);
4083   for (Instruction *PI : PredicatedInstructions)
4084     sinkScalarOperands(&*PI);
4085 
4086   // Remove redundant induction instructions.
4087   cse(LoopVectorBody);
4088 
4089   // Set/update profile weights for the vector and remainder loops as original
4090   // loop iterations are now distributed among them. Note that original loop
4091   // represented by LoopScalarBody becomes remainder loop after vectorization.
4092   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly roughened result, but that should be OK since the
  // profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
4098   //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
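  // For example (illustrative only), with VF * UF == 8 an original trip
  // count of 1003 is attributed as roughly 125 vector iterations, with the
  // remaining 3 iterations attributed to the scalar remainder loop.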
4102   setProfileInfoAfterUnrolling(
4103       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
4104       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
4105 }
4106 
4107 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
4108   // In order to support recurrences we need to be able to vectorize Phi nodes.
4109   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4110   // stage #2: We now need to fix the recurrences by adding incoming edges to
4111   // the currently empty PHI nodes. At this point every instruction in the
4112   // original loop is widened to a vector form so we can use them to construct
4113   // the incoming edges.
4114   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
4115   for (VPRecipeBase &R : Header->phis()) {
4116     auto *PhiR = dyn_cast<VPWidenPHIRecipe>(&R);
4117     if (!PhiR)
4118       continue;
4119     auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4120     if (PhiR->getRecurrenceDescriptor()) {
4121       fixReduction(PhiR, State);
4122     } else if (Legal->isFirstOrderRecurrence(OrigPhi))
4123       fixFirstOrderRecurrence(OrigPhi, State);
4124   }
4125 }
4126 
4127 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi,
4128                                                   VPTransformState &State) {
4129   // This is the second phase of vectorizing first-order recurrences. An
4130   // overview of the transformation is described below. Suppose we have the
4131   // following loop.
4132   //
4133   //   for (int i = 0; i < n; ++i)
4134   //     b[i] = a[i] - a[i - 1];
4135   //
4136   // There is a first-order recurrence on "a". For this loop, the shorthand
4137   // scalar IR looks like:
4138   //
4139   //   scalar.ph:
4140   //     s_init = a[-1]
4141   //     br scalar.body
4142   //
4143   //   scalar.body:
4144   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4145   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4146   //     s2 = a[i]
4147   //     b[i] = s2 - s1
4148   //     br cond, scalar.body, ...
4149   //
  // In this example, s1 is a recurrence because its value depends on the
4151   // previous iteration. In the first phase of vectorization, we created a
4152   // temporary value for s1. We now complete the vectorization and produce the
4153   // shorthand vector IR shown below (for VF = 4, UF = 1).
4154   //
4155   //   vector.ph:
4156   //     v_init = vector(..., ..., ..., a[-1])
4157   //     br vector.body
4158   //
4159   //   vector.body
4160   //     i = phi [0, vector.ph], [i+4, vector.body]
4161   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
4162   //     v2 = a[i, i+1, i+2, i+3];
4163   //     v3 = vector(v1(3), v2(0, 1, 2))
4164   //     b[i, i+1, i+2, i+3] = v2 - v3
4165   //     br cond, vector.body, middle.block
4166   //
4167   //   middle.block:
4168   //     x = v2(3)
4169   //     br scalar.ph
4170   //
4171   //   scalar.ph:
4172   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4173   //     br scalar.body
4174   //
4175   // After execution completes the vector loop, we extract the next value of
4176   // the recurrence (x) to use as the initial value in the scalar loop.
4177 
4178   // Get the original loop preheader and single loop latch.
4179   auto *Preheader = OrigLoop->getLoopPreheader();
4180   auto *Latch = OrigLoop->getLoopLatch();
4181 
4182   // Get the initial and previous values of the scalar recurrence.
4183   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
4184   auto *Previous = Phi->getIncomingValueForBlock(Latch);
4185 
4186   auto *IdxTy = Builder.getInt32Ty();
4187   auto *One = ConstantInt::get(IdxTy, 1);
4188 
4189   // Create a vector from the initial value.
4190   auto *VectorInit = ScalarInit;
4191   if (VF.isVector()) {
4192     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4193     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4194     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4195     VectorInit = Builder.CreateInsertElement(
4196         PoisonValue::get(VectorType::get(VectorInit->getType(), VF)),
4197         VectorInit, LastIdx, "vector.recur.init");
4198   }
4199 
4200   VPValue *PhiDef = State.Plan->getVPValue(Phi);
4201   VPValue *PreviousDef = State.Plan->getVPValue(Previous);
4202   // We constructed a temporary phi node in the first phase of vectorization.
4203   // This phi node will eventually be deleted.
4204   Builder.SetInsertPoint(cast<Instruction>(State.get(PhiDef, 0)));
4205 
4206   // Create a phi node for the new recurrence. The current value will either be
4207   // the initial value inserted into a vector or loop-varying vector value.
4208   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
4209   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
4210 
4211   // Get the vectorized previous value of the last part UF - 1. It appears last
4212   // among all unrolled iterations, due to the order of their construction.
4213   Value *PreviousLastPart = State.get(PreviousDef, UF - 1);
4214 
4215   // Find and set the insertion point after the previous value if it is an
4216   // instruction.
4217   BasicBlock::iterator InsertPt;
4218   // Note that the previous value may have been constant-folded so it is not
4219   // guaranteed to be an instruction in the vector loop.
4220   // FIXME: Loop invariant values do not form recurrences. We should deal with
4221   //        them earlier.
4222   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
4223     InsertPt = LoopVectorBody->getFirstInsertionPt();
4224   else {
4225     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
4226     if (isa<PHINode>(PreviousLastPart))
4227       // If the previous value is a phi node, we should insert after all the phi
4228       // nodes in the block containing the PHI to avoid breaking basic block
4229       // verification. Note that the basic block may be different to
4230       // LoopVectorBody, in case we predicate the loop.
4231       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
4232     else
4233       InsertPt = ++PreviousInst->getIterator();
4234   }
4235   Builder.SetInsertPoint(&*InsertPt);
4236 
4237   // The vector from which to take the initial value for the current iteration
4238   // (actual or unrolled). Initially, this is the vector phi node.
4239   Value *Incoming = VecPhi;
4240 
4241   // Shuffle the current and previous vector and update the vector parts.
4242   for (unsigned Part = 0; Part < UF; ++Part) {
4243     Value *PreviousPart = State.get(PreviousDef, Part);
4244     Value *PhiPart = State.get(PhiDef, Part);
4245     auto *Shuffle = VF.isVector()
4246                         ? Builder.CreateVectorSplice(Incoming, PreviousPart, -1)
4247                         : Incoming;
4248     PhiPart->replaceAllUsesWith(Shuffle);
4249     cast<Instruction>(PhiPart)->eraseFromParent();
4250     State.reset(PhiDef, Shuffle, Part);
4251     Incoming = PreviousPart;
4252   }
4253 
4254   // Fix the latch value of the new recurrence in the vector loop.
4255   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
4256 
4257   // Extract the last vector element in the middle block. This will be the
4258   // initial value for the recurrence when jumping to the scalar loop.
4259   auto *ExtractForScalar = Incoming;
4260   if (VF.isVector()) {
4261     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4262     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4263     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4264     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
4265                                                     "vector.recur.extract");
4266   }
4267   // Extract the second last element in the middle block if the
4268   // Phi is used outside the loop. We need to extract the phi itself
4269   // and not the last element (the phi update in the current iteration). This
4270   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4271   // when the scalar loop is not run at all.
4272   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4273   if (VF.isVector()) {
4274     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4275     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4276     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4277         Incoming, Idx, "vector.recur.extract.for.phi");
4278   } else if (UF > 1)
    // When the loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
    // value of `Incoming`. This is analogous to the vectorized case above:
    // extracting the second last element when VF > 1.
4283     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4284 
4285   // Fix the initial value of the original recurrence in the scalar loop.
4286   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4287   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4288   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4289     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4290     Start->addIncoming(Incoming, BB);
4291   }
4292 
4293   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4294   Phi->setName("scalar.recur");
4295 
4296   // Finally, fix users of the recurrence outside the loop. The users will need
4297   // either the last value of the scalar recurrence or the last value of the
4298   // vector recurrence we extracted in the middle block. Since the loop is in
4299   // LCSSA form, we just need to find all the phi nodes for the original scalar
4300   // recurrence in the exit block, and then add an edge for the middle block.
4301   // Note that LCSSA does not imply single entry when the original scalar loop
4302   // had multiple exiting edges (as we always run the last iteration in the
4303   // scalar epilogue); in that case, the exiting path through middle will be
4304   // dynamically dead and the value picked for the phi doesn't matter.
4305   for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4306     if (any_of(LCSSAPhi.incoming_values(),
4307                [Phi](Value *V) { return V == Phi; }))
4308       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4309 }
4310 
4311 static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
4312   return EnableStrictReductions && RdxDesc.isOrdered();
4313 }
4314 
4315 void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR,
4316                                        VPTransformState &State) {
4317   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
4319   assert(Legal->isReductionVariable(OrigPhi) &&
4320          "Unable to find the reduction variable");
4321   RecurrenceDescriptor RdxDesc = *PhiR->getRecurrenceDescriptor();
4322 
4323   RecurKind RK = RdxDesc.getRecurrenceKind();
4324   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4325   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4326   setDebugLocFromInst(Builder, ReductionStartValue);
4327   bool IsInLoopReductionPhi = Cost->isInLoopReduction(OrigPhi);
4328 
4329   VPValue *LoopExitInstDef = State.Plan->getVPValue(LoopExitInst);
4330   // This is the vector-clone of the value that leaves the loop.
4331   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4332 
4333   // Wrap flags are in general invalid after vectorization, clear them.
4334   clearReductionWrapFlags(RdxDesc, State);
4335 
4336   // Fix the vector-loop phi.
4337 
4338   // Reductions do not have to start at zero. They can start with
4339   // any loop invariant values.
4340   BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4341 
4342   bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi &&
4343                    useOrderedReductions(RdxDesc);
4344 
4345   for (unsigned Part = 0; Part < UF; ++Part) {
4346     if (IsOrdered && Part > 0)
4347       break;
4348     Value *VecRdxPhi = State.get(PhiR->getVPSingleValue(), Part);
4349     Value *Val = State.get(PhiR->getBackedgeValue(), Part);
4350     if (IsOrdered)
4351       Val = State.get(PhiR->getBackedgeValue(), UF - 1);
4352 
4353     cast<PHINode>(VecRdxPhi)->addIncoming(Val, VectorLoopLatch);
4354   }
4355 
4356   // Before each round, move the insertion point right between
4357   // the PHIs and the values we are going to write.
4358   // This allows us to write both PHINodes and the extractelement
4359   // instructions.
4360   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4361 
4362   setDebugLocFromInst(Builder, LoopExitInst);
4363 
4364   Type *PhiTy = OrigPhi->getType();
4365   // If tail is folded by masking, the vector value to leave the loop should be
4366   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4367   // instead of the former. For an inloop reduction the reduction will already
4368   // be predicated, and does not need to be handled here.
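  // In shorthand (illustrative only), the value leaving the loop becomes
  //   %sel = select <VF x i1> %mask, <VF x Ty> %rdx.next,
  //                                  <VF x Ty> %vec.phi
  // rather than %rdx.next itself.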
4369   if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
4370     for (unsigned Part = 0; Part < UF; ++Part) {
4371       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4372       Value *Sel = nullptr;
4373       for (User *U : VecLoopExitInst->users()) {
4374         if (isa<SelectInst>(U)) {
4375           assert(!Sel && "Reduction exit feeding two selects");
4376           Sel = U;
4377         } else
4378           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4379       }
4380       assert(Sel && "Reduction exit feeds no select");
4381       State.reset(LoopExitInstDef, Sel, Part);
4382 
4383       // If the target can create a predicated operator for the reduction at no
4384       // extra cost in the loop (for example a predicated vadd), it can be
4385       // cheaper for the select to remain in the loop than be sunk out of it,
4386       // and so use the select value for the phi instead of the old
4387       // LoopExitValue.
4388       if (PreferPredicatedReductionSelect ||
4389           TTI->preferPredicatedReductionSelect(
4390               RdxDesc.getOpcode(), PhiTy,
4391               TargetTransformInfo::ReductionFlags())) {
4392         auto *VecRdxPhi =
4393             cast<PHINode>(State.get(PhiR->getVPSingleValue(), Part));
4394         VecRdxPhi->setIncomingValueForBlock(
4395             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4396       }
4397     }
4398   }
4399 
4400   // If the vector reduction can be performed in a smaller type, we truncate
4401   // then extend the loop exit value to enable InstCombine to evaluate the
4402   // entire expression in the smaller type.
4403   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4404     assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
4405     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4406     Builder.SetInsertPoint(
4407         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4408     VectorParts RdxParts(UF);
4409     for (unsigned Part = 0; Part < UF; ++Part) {
4410       RdxParts[Part] = State.get(LoopExitInstDef, Part);
4411       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4412       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4413                                         : Builder.CreateZExt(Trunc, VecTy);
4414       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
4415            UI != RdxParts[Part]->user_end();)
4416         if (*UI != Trunc) {
4417           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
4418           RdxParts[Part] = Extnd;
4419         } else {
4420           ++UI;
4421         }
4422     }
4423     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4424     for (unsigned Part = 0; Part < UF; ++Part) {
4425       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4426       State.reset(LoopExitInstDef, RdxParts[Part], Part);
4427     }
4428   }
4429 
4430   // Reduce all of the unrolled parts into a single vector.
4431   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4432   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
4433 
4434   // The middle block terminator has already been assigned a DebugLoc here (the
4435   // OrigLoop's single latch terminator). We want the whole middle block to
4436   // appear to execute on this line because: (a) it is all compiler generated,
4437   // (b) these instructions are always executed after evaluating the latch
4438   // conditional branch, and (c) other passes may add new predecessors which
4439   // terminate on this line. This is the easiest way to ensure we don't
4440   // accidentally cause an extra step back into the loop while debugging.
4441   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
4442   if (IsOrdered)
4443     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4444   else {
4445     // Floating-point operations should have some FMF to enable the reduction.
4446     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4447     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4448     for (unsigned Part = 1; Part < UF; ++Part) {
4449       Value *RdxPart = State.get(LoopExitInstDef, Part);
4450       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4451         ReducedPartRdx = Builder.CreateBinOp(
4452             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4453       } else {
4454         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4455       }
4456     }
4457   }
4458 
4459   // Create the reduction after the loop. Note that inloop reductions create the
4460   // target reduction in the loop using a Reduction recipe.
4461   if (VF.isVector() && !IsInLoopReductionPhi) {
4462     ReducedPartRdx =
4463         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
4464     // If the reduction can be performed in a smaller type, we need to extend
4465     // the reduction to the wider type before we branch to the original loop.
4466     if (PhiTy != RdxDesc.getRecurrenceType())
4467       ReducedPartRdx = RdxDesc.isSigned()
4468                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4469                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4470   }
4471 
4472   // Create a phi node that merges control-flow from the backedge-taken check
4473   // block and the middle block.
4474   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4475                                         LoopScalarPreHeader->getTerminator());
4476   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
4477     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
4478   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
4479 
4480   // Now, we need to fix the users of the reduction variable
4481   // inside and outside of the scalar remainder loop.
4482 
4483   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4484   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4486   for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4487     if (any_of(LCSSAPhi.incoming_values(),
4488                [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4489       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4490 
4491   // Fix the scalar loop reduction variable with the incoming reduction sum
4492   // from the vector body and from the backedge value.
4493   int IncomingEdgeBlockIdx =
4494       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4495   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4496   // Pick the other block.
4497   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4498   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4499   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4500 }
4501 
4502 void InnerLoopVectorizer::clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc,
4503                                                   VPTransformState &State) {
4504   RecurKind RK = RdxDesc.getRecurrenceKind();
4505   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4506     return;
4507 
4508   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4509   assert(LoopExitInstr && "null loop exit instruction");
4510   SmallVector<Instruction *, 8> Worklist;
4511   SmallPtrSet<Instruction *, 8> Visited;
4512   Worklist.push_back(LoopExitInstr);
4513   Visited.insert(LoopExitInstr);
4514 
4515   while (!Worklist.empty()) {
4516     Instruction *Cur = Worklist.pop_back_val();
4517     if (isa<OverflowingBinaryOperator>(Cur))
4518       for (unsigned Part = 0; Part < UF; ++Part) {
4519         Value *V = State.get(State.Plan->getVPValue(Cur), Part);
4520         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4521       }
4522 
4523     for (User *U : Cur->users()) {
4524       Instruction *UI = cast<Instruction>(U);
4525       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4526           Visited.insert(UI).second)
4527         Worklist.push_back(UI);
4528     }
4529   }
4530 }
4531 
4532 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4533   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4534     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4535       // Some phis were already hand updated by the reduction and recurrence
4536       // code above, leave them alone.
4537       continue;
4538 
4539     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4540     // Non-instruction incoming values will have only one value.
4541 
4542     VPLane Lane = VPLane::getFirstLane();
4543     if (isa<Instruction>(IncomingValue) &&
4544         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4545                                            VF))
4546       Lane = VPLane::getLastLaneForVF(VF);
4547 
4548     // Can be a loop invariant incoming value or the last scalar value to be
4549     // extracted from the vectorized loop.
4550     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4551     Value *lastIncomingValue =
4552         OrigLoop->isLoopInvariant(IncomingValue)
4553             ? IncomingValue
4554             : State.get(State.Plan->getVPValue(IncomingValue),
4555                         VPIteration(UF - 1, Lane));
4556     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4557   }
4558 }
4559 
4560 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4561   // The basic block and loop containing the predicated instruction.
4562   auto *PredBB = PredInst->getParent();
4563   auto *VectorLoop = LI->getLoopFor(PredBB);
4564 
4565   // Initialize a worklist with the operands of the predicated instruction.
4566   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4567 
4568   // Holds instructions that we need to analyze again. An instruction may be
4569   // reanalyzed if we don't yet know if we can sink it or not.
4570   SmallVector<Instruction *, 8> InstsToReanalyze;
4571 
4572   // Returns true if a given use occurs in the predicated block. Phi nodes use
4573   // their operands in their corresponding predecessor blocks.
4574   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4575     auto *I = cast<Instruction>(U.getUser());
4576     BasicBlock *BB = I->getParent();
4577     if (auto *Phi = dyn_cast<PHINode>(I))
4578       BB = Phi->getIncomingBlock(
4579           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4580     return BB == PredBB;
4581   };
4582 
  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when one pass
  // through the worklist doesn't sink a single instruction.
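  // For example (illustrative only), a scalarized address computation whose
  // only use is a predicated store that already lives in PredBB can be moved
  // into PredBB; its own operands are then reconsidered for sinking as well.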
4587   bool Changed;
4588   do {
4589     // Add the instructions that need to be reanalyzed to the worklist, and
4590     // reset the changed indicator.
4591     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4592     InstsToReanalyze.clear();
4593     Changed = false;
4594 
4595     while (!Worklist.empty()) {
4596       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4597 
4598       // We can't sink an instruction if it is a phi node, is not in the loop,
4599       // or may have side effects.
4600       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4601           I->mayHaveSideEffects())
4602         continue;
4603 
4604       // If the instruction is already in PredBB, check if we can sink its
4605       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4606       // sinking the scalar instruction I, hence it appears in PredBB; but it
4607       // may have failed to sink I's operands (recursively), which we try
4608       // (again) here.
4609       if (I->getParent() == PredBB) {
4610         Worklist.insert(I->op_begin(), I->op_end());
4611         continue;
4612       }
4613 
4614       // It's legal to sink the instruction if all its uses occur in the
4615       // predicated block. Otherwise, there's nothing to do yet, and we may
4616       // need to reanalyze the instruction.
4617       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4618         InstsToReanalyze.push_back(I);
4619         continue;
4620       }
4621 
4622       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4624       I->moveBefore(&*PredBB->getFirstInsertionPt());
4625       Worklist.insert(I->op_begin(), I->op_end());
4626 
4627       // The sinking may have enabled other instructions to be sunk, so we will
4628       // need to iterate.
4629       Changed = true;
4630     }
4631   } while (Changed);
4632 }
4633 
4634 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4635   for (PHINode *OrigPhi : OrigPHIsToFix) {
4636     VPWidenPHIRecipe *VPPhi =
4637         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4638     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4639     // Make sure the builder has a valid insert point.
4640     Builder.SetInsertPoint(NewPhi);
4641     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4642       VPValue *Inc = VPPhi->getIncomingValue(i);
4643       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4644       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4645     }
4646   }
4647 }
4648 
4649 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
4650                                    VPUser &Operands, unsigned UF,
4651                                    ElementCount VF, bool IsPtrLoopInvariant,
4652                                    SmallBitVector &IsIndexLoopInvariant,
4653                                    VPTransformState &State) {
4654   // Construct a vector GEP by widening the operands of the scalar GEP as
4655   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4656   // results in a vector of pointers when at least one operand of the GEP
4657   // is vector-typed. Thus, to keep the representation compact, we only use
4658   // vector-typed operands for loop-varying values.
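  // In shorthand (illustrative only), a scalar GEP with a loop-varying index
  //   %p = getelementptr inbounds i32, i32* %base, i64 %idx
  // is widened into a GEP producing a vector of pointers
  //   %p = getelementptr inbounds i32, i32* %base, <4 x i64> %vec.idx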
4659 
4660   if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4661     // If we are vectorizing, but the GEP has only loop-invariant operands,
4662     // the GEP we build (by only using vector-typed operands for
4663     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4664     // produce a vector of pointers, we need to either arbitrarily pick an
4665     // operand to broadcast, or broadcast a clone of the original GEP.
4666     // Here, we broadcast a clone of the original.
4667     //
4668     // TODO: If at some point we decide to scalarize instructions having
4669     //       loop-invariant operands, this special case will no longer be
4670     //       required. We would add the scalarization decision to
4671     //       collectLoopScalars() and teach getVectorValue() to broadcast
4672     //       the lane-zero scalar value.
4673     auto *Clone = Builder.Insert(GEP->clone());
4674     for (unsigned Part = 0; Part < UF; ++Part) {
4675       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4676       State.set(VPDef, EntryPart, Part);
4677       addMetadata(EntryPart, GEP);
4678     }
4679   } else {
4680     // If the GEP has at least one loop-varying operand, we are sure to
4681     // produce a vector of pointers. But if we are only unrolling, we want
4682     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4683     // produce with the code below will be scalar (if VF == 1) or vector
4684     // (otherwise). Note that for the unroll-only case, we still maintain
4685     // values in the vector mapping with initVector, as we do for other
4686     // instructions.
4687     for (unsigned Part = 0; Part < UF; ++Part) {
4688       // The pointer operand of the new GEP. If it's loop-invariant, we
4689       // won't broadcast it.
4690       auto *Ptr = IsPtrLoopInvariant
4691                       ? State.get(Operands.getOperand(0), VPIteration(0, 0))
4692                       : State.get(Operands.getOperand(0), Part);
4693 
4694       // Collect all the indices for the new GEP. If any index is
4695       // loop-invariant, we won't broadcast it.
4696       SmallVector<Value *, 4> Indices;
4697       for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
4698         VPValue *Operand = Operands.getOperand(I);
4699         if (IsIndexLoopInvariant[I - 1])
4700           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
4701         else
4702           Indices.push_back(State.get(Operand, Part));
4703       }
4704 
4705       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4706       // but it should be a vector, otherwise.
4707       auto *NewGEP =
4708           GEP->isInBounds()
4709               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4710                                           Indices)
4711               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4712       assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
4713              "NewGEP is not a pointer vector");
4714       State.set(VPDef, NewGEP, Part);
4715       addMetadata(NewGEP, GEP);
4716     }
4717   }
4718 }
4719 
4720 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4721                                               RecurrenceDescriptor *RdxDesc,
4722                                               VPWidenPHIRecipe *PhiR,
4723                                               VPTransformState &State) {
4724   PHINode *P = cast<PHINode>(PN);
4725   if (EnableVPlanNativePath) {
4726     // Currently we enter here in the VPlan-native path for non-induction
4727     // PHIs where all control flow is uniform. We simply widen these PHIs.
4728     // Create a vector phi with no operands - the vector phi operands will be
4729     // set at the end of vector code generation.
4730     Type *VecTy = (State.VF.isScalar())
4731                       ? PN->getType()
4732                       : VectorType::get(PN->getType(), State.VF);
4733     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4734     State.set(PhiR, VecPhi, 0);
4735     OrigPHIsToFix.push_back(P);
4736 
4737     return;
4738   }
4739 
4740   assert(PN->getParent() == OrigLoop->getHeader() &&
4741          "Non-header phis should have been handled elsewhere");
4742 
4743   VPValue *StartVPV = PhiR->getStartValue();
4744   Value *StartV = StartVPV ? StartVPV->getLiveInIRValue() : nullptr;
4745   // In order to support recurrences we need to be able to vectorize Phi nodes.
4746   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4747   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4748   // this value when we vectorize all of the instructions that use the PHI.
4749   if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
4750     Value *Iden = nullptr;
4751     bool ScalarPHI =
4752         (State.VF.isScalar()) || Cost->isInLoopReduction(cast<PHINode>(PN));
4753     Type *VecTy =
4754         ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
4755 
4756     if (RdxDesc) {
4757       assert(Legal->isReductionVariable(P) && StartV &&
4758              "RdxDesc should only be set for reduction variables; in that case "
4759              "a StartV is also required");
4760       RecurKind RK = RdxDesc->getRecurrenceKind();
4761       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK)) {
        // MinMax reductions have the start value as their identity.
4763         if (ScalarPHI) {
4764           Iden = StartV;
4765         } else {
4766           IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4767           Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4768           StartV = Iden =
4769               Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
4770         }
4771       } else {
4772         Constant *IdenC = RecurrenceDescriptor::getRecurrenceIdentity(
4773             RK, VecTy->getScalarType(), RdxDesc->getFastMathFlags());
4774         Iden = IdenC;
4775 
4776         if (!ScalarPHI) {
4777           Iden = ConstantVector::getSplat(State.VF, IdenC);
4778           IRBuilderBase::InsertPointGuard IPBuilder(Builder);
4779           Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
4780           Constant *Zero = Builder.getInt32(0);
4781           StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
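          // Schematic example: for an integer add reduction with VF = 4 this
          // yields Iden = <0, 0, 0, 0> and StartV = <start, 0, 0, 0>, so the
          // original start value contributes to the reduction exactly once.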
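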
4782         }
4783       }
4784     }
4785 
4786     bool IsOrdered = State.VF.isVector() &&
4787                      Cost->isInLoopReduction(cast<PHINode>(PN)) &&
4788                      useOrderedReductions(*RdxDesc);
4789 
4790     for (unsigned Part = 0; Part < State.UF; ++Part) {
4791       // This is phase one of vectorizing PHIs.
4792       if (Part > 0 && IsOrdered)
4793         return;
4794       Value *EntryPart = PHINode::Create(
4795           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4796       State.set(PhiR, EntryPart, Part);
4797       if (StartV) {
4798         // Make sure to add the reduction start value only to the
4799         // first unroll part.
4800         Value *StartVal = (Part == 0) ? StartV : Iden;
4801         cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
4802       }
4803     }
4804     return;
4805   }
4806 
4807   assert(!Legal->isReductionVariable(P) &&
4808          "reductions should be handled above");
4809 
4810   setDebugLocFromInst(Builder, P);
4811 
4812   // This PHINode must be an induction variable.
4813   // Make sure that we know about it.
4814   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4815 
4816   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4817   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4818 
4819   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4820   // which can be found from the original scalar operations.
4821   switch (II.getKind()) {
4822   case InductionDescriptor::IK_NoInduction:
4823     llvm_unreachable("Unknown induction");
4824   case InductionDescriptor::IK_IntInduction:
4825   case InductionDescriptor::IK_FpInduction:
4826     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4827   case InductionDescriptor::IK_PtrInduction: {
4828     // Handle the pointer induction variable case.
4829     assert(P->getType()->isPointerTy() && "Unexpected type.");
4830 
4831     if (Cost->isScalarAfterVectorization(P, State.VF)) {
4832       // This is the normalized GEP that starts counting at zero.
4833       Value *PtrInd =
4834           Builder.CreateSExtOrTrunc(Induction, II.getStep()->getType());
4835       // Determine the number of scalars we need to generate for each unroll
4836       // iteration. If the instruction is uniform, we only need to generate the
4837       // first lane. Otherwise, we generate all VF values.
4838       bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
4839       unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
4840 
4841       bool NeedsVectorIndex = !IsUniform && VF.isScalable();
4842       Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
4843       if (NeedsVectorIndex) {
4844         Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
4845         UnitStepVec = Builder.CreateStepVector(VecIVTy);
4846         PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
4847       }
4848 
4849       for (unsigned Part = 0; Part < UF; ++Part) {
4850         Value *PartStart = createStepForVF(
4851             Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
4852 
4853         if (NeedsVectorIndex) {
4854           Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
4855           Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
4856           Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
4857           Value *SclrGep =
4858               emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
4859           SclrGep->setName("next.gep");
4860           State.set(PhiR, SclrGep, Part);
4861           // We've cached the whole vector, which means we can support the
4862           // extraction of any lane.
4863           continue;
4864         }
4865 
4866         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4867           Value *Idx = Builder.CreateAdd(
4868               PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4869           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4870           Value *SclrGep =
4871               emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4872           SclrGep->setName("next.gep");
4873           State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4874         }
4875       }
4876       return;
4877     }
4878     assert(isa<SCEVConstant>(II.getStep()) &&
4879            "Induction step not a SCEV constant!");
4880     Type *PhiType = II.getStep()->getType();
4881 
4882     // Build a pointer phi
4883     Value *ScalarStartValue = II.getStartValue();
4884     Type *ScStValueType = ScalarStartValue->getType();
4885     PHINode *NewPointerPhi =
4886         PHINode::Create(ScStValueType, 2, "pointer.phi", Induction);
4887     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4888 
    // A pointer induction, performed by using a GEP.
4890     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4891     Instruction *InductionLoc = LoopLatch->getTerminator();
4892     const SCEV *ScalarStep = II.getStep();
4893     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4894     Value *ScalarStepValue =
4895         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4896     Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4897     Value *NumUnrolledElems =
4898         Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4899     Value *InductionGEP = GetElementPtrInst::Create(
4900         ScStValueType->getPointerElementType(), NewPointerPhi,
4901         Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4902         InductionLoc);
4903     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4904 
4905     // Create UF many actual address geps that use the pointer
4906     // phi as base and a vectorized version of the step value
4907     // (<step*0, ..., step*N>) as offset.
4908     for (unsigned Part = 0; Part < State.UF; ++Part) {
4909       Type *VecPhiType = VectorType::get(PhiType, State.VF);
4910       Value *StartOffsetScalar =
4911           Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4912       Value *StartOffset =
4913           Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
      // Create a vector of consecutive numbers from 0 to VF - 1.
4915       StartOffset =
4916           Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4917 
4918       Value *GEP = Builder.CreateGEP(
4919           ScStValueType->getPointerElementType(), NewPointerPhi,
4920           Builder.CreateMul(
4921               StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4922               "vector.gep"));
4923       State.set(PhiR, GEP, Part);
4924     }
4925   }
4926   }
4927 }
4928 
4929 /// A helper function for checking whether an integer division-related
4930 /// instruction may divide by zero (in which case it must be predicated if
4931 /// executed conditionally in the scalar code).
4932 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so without predication.
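/// For example, a udiv by the constant 7 can never divide by zero and needs
/// no predication, whereas a udiv by a loop-varying value (or by a literal
/// zero) conservatively may.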
4936 static bool mayDivideByZero(Instruction &I) {
4937   assert((I.getOpcode() == Instruction::UDiv ||
4938           I.getOpcode() == Instruction::SDiv ||
4939           I.getOpcode() == Instruction::URem ||
4940           I.getOpcode() == Instruction::SRem) &&
4941          "Unexpected instruction");
4942   Value *Divisor = I.getOperand(1);
4943   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4944   return !CInt || CInt->isZero();
4945 }
4946 
4947 void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
4948                                            VPUser &User,
4949                                            VPTransformState &State) {
4950   switch (I.getOpcode()) {
4951   case Instruction::Call:
4952   case Instruction::Br:
4953   case Instruction::PHI:
4954   case Instruction::GetElementPtr:
4955   case Instruction::Select:
4956     llvm_unreachable("This instruction is handled by a different recipe.");
4957   case Instruction::UDiv:
4958   case Instruction::SDiv:
4959   case Instruction::SRem:
4960   case Instruction::URem:
4961   case Instruction::Add:
4962   case Instruction::FAdd:
4963   case Instruction::Sub:
4964   case Instruction::FSub:
4965   case Instruction::FNeg:
4966   case Instruction::Mul:
4967   case Instruction::FMul:
4968   case Instruction::FDiv:
4969   case Instruction::FRem:
4970   case Instruction::Shl:
4971   case Instruction::LShr:
4972   case Instruction::AShr:
4973   case Instruction::And:
4974   case Instruction::Or:
4975   case Instruction::Xor: {
4976     // Just widen unops and binops.
4977     setDebugLocFromInst(Builder, &I);
4978 
4979     for (unsigned Part = 0; Part < UF; ++Part) {
4980       SmallVector<Value *, 2> Ops;
4981       for (VPValue *VPOp : User.operands())
4982         Ops.push_back(State.get(VPOp, Part));
4983 
4984       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4985 
4986       if (auto *VecOp = dyn_cast<Instruction>(V))
4987         VecOp->copyIRFlags(&I);
4988 
4989       // Use this vector value for all users of the original instruction.
4990       State.set(Def, V, Part);
4991       addMetadata(V, &I);
4992     }
4993 
4994     break;
4995   }
4996   case Instruction::ICmp:
4997   case Instruction::FCmp: {
4998     // Widen compares. Generate vector compares.
4999     bool FCmp = (I.getOpcode() == Instruction::FCmp);
5000     auto *Cmp = cast<CmpInst>(&I);
5001     setDebugLocFromInst(Builder, Cmp);
5002     for (unsigned Part = 0; Part < UF; ++Part) {
5003       Value *A = State.get(User.getOperand(0), Part);
5004       Value *B = State.get(User.getOperand(1), Part);
5005       Value *C = nullptr;
5006       if (FCmp) {
5007         // Propagate fast math flags.
5008         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
5009         Builder.setFastMathFlags(Cmp->getFastMathFlags());
5010         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
5011       } else {
5012         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
5013       }
5014       State.set(Def, C, Part);
5015       addMetadata(C, &I);
5016     }
5017 
5018     break;
5019   }
5020 
5021   case Instruction::ZExt:
5022   case Instruction::SExt:
5023   case Instruction::FPToUI:
5024   case Instruction::FPToSI:
5025   case Instruction::FPExt:
5026   case Instruction::PtrToInt:
5027   case Instruction::IntToPtr:
5028   case Instruction::SIToFP:
5029   case Instruction::UIToFP:
5030   case Instruction::Trunc:
5031   case Instruction::FPTrunc:
5032   case Instruction::BitCast: {
5033     auto *CI = cast<CastInst>(&I);
5034     setDebugLocFromInst(Builder, CI);
5035 
    // Vectorize casts.
5037     Type *DestTy =
5038         (VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
5039 
5040     for (unsigned Part = 0; Part < UF; ++Part) {
5041       Value *A = State.get(User.getOperand(0), Part);
5042       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
5043       State.set(Def, Cast, Part);
5044       addMetadata(Cast, &I);
5045     }
5046     break;
5047   }
5048   default:
5049     // This instruction is not vectorized by simple widening.
5050     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
5051     llvm_unreachable("Unhandled instruction!");
5052   } // end of switch.
5053 }
5054 
5055 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
5056                                                VPUser &ArgOperands,
5057                                                VPTransformState &State) {
5058   assert(!isa<DbgInfoIntrinsic>(I) &&
5059          "DbgInfoIntrinsic should have been dropped during VPlan construction");
5060   setDebugLocFromInst(Builder, &I);
5061 
5062   Module *M = I.getParent()->getParent()->getParent();
5063   auto *CI = cast<CallInst>(&I);
5064 
5065   SmallVector<Type *, 4> Tys;
5066   for (Value *ArgOperand : CI->arg_operands())
5067     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
5068 
5069   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
5070 
  // The flag indicates whether we use an intrinsic or a plain call for the
  // vectorized version of the instruction, i.e. whether the intrinsic call is
  // more beneficial than the library call.
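  // For example, a call that maps to llvm.sqrt may be widened either to the
  // vector form of that intrinsic or to a vectorized library routine known to
  // the VFDatabase; the cheaper variant according to the cost model (with the
  // intrinsic preferred on a tie) is used.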
5074   bool NeedToScalarize = false;
5075   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
5076   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
5077   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
5078   assert((UseVectorIntrinsic || !NeedToScalarize) &&
5079          "Instruction should be scalarized elsewhere.");
5080   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
5081          "Either the intrinsic cost or vector call cost must be valid");
5082 
5083   for (unsigned Part = 0; Part < UF; ++Part) {
5084     SmallVector<Value *, 4> Args;
5085     for (auto &I : enumerate(ArgOperands.operands())) {
5086       // Some intrinsics have a scalar argument - don't replace it with a
5087       // vector.
5088       Value *Arg;
5089       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
5090         Arg = State.get(I.value(), Part);
5091       else
5092         Arg = State.get(I.value(), VPIteration(0, 0));
5093       Args.push_back(Arg);
5094     }
5095 
5096     Function *VectorF;
5097     if (UseVectorIntrinsic) {
5098       // Use vector version of the intrinsic.
5099       Type *TysForDecl[] = {CI->getType()};
5100       if (VF.isVector())
5101         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
5102       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
5103       assert(VectorF && "Can't retrieve vector intrinsic.");
5104     } else {
5105       // Use vector version of the function call.
5106       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
5107 #ifndef NDEBUG
5108       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
5109              "Can't create vector function.");
5110 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    addMetadata(V, &I);
5122   }
5123 }
5124 
5125 void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
5126                                                  VPUser &Operands,
5127                                                  bool InvariantCond,
5128                                                  VPTransformState &State) {
5129   setDebugLocFromInst(Builder, &I);
5130 
  // The condition can be loop invariant but still defined inside the
5132   // loop. This means that we can't just use the original 'cond' value.
5133   // We have to take the 'vectorized' value and pick the first lane.
5134   // Instcombine will make this a no-op.
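  // Schematic illustration (VF = 4): an invariant condition yields
  //   %sel = select i1 %cond, <4 x T> %op0, <4 x T> %op1
  // whereas a varying condition yields
  //   %sel = select <4 x i1> %cond.vec, <4 x T> %op0, <4 x T> %op1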
5135   auto *InvarCond = InvariantCond
5136                         ? State.get(Operands.getOperand(0), VPIteration(0, 0))
5137                         : nullptr;
5138 
5139   for (unsigned Part = 0; Part < UF; ++Part) {
5140     Value *Cond =
5141         InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
5142     Value *Op0 = State.get(Operands.getOperand(1), Part);
5143     Value *Op1 = State.get(Operands.getOperand(2), Part);
5144     Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
5145     State.set(VPDef, Sel, Part);
5146     addMetadata(Sel, &I);
5147   }
5148 }
5149 
5150 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
5151   // We should not collect Scalars more than once per VF. Right now, this
5152   // function is called from collectUniformsAndScalars(), which already does
5153   // this check. Collecting Scalars for VF=1 does not make any sense.
5154   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
5155          "This function should not be visited twice for the same VF");
5156 
5157   SmallSetVector<Instruction *, 8> Worklist;
5158 
5159   // These sets are used to seed the analysis with pointers used by memory
5160   // accesses that will remain scalar.
5161   SmallSetVector<Instruction *, 8> ScalarPtrs;
5162   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
5163   auto *Latch = TheLoop->getLoopLatch();
5164 
5165   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
5166   // The pointer operands of loads and stores will be scalar as long as the
5167   // memory access is not a gather or scatter operation. The value operand of a
5168   // store will remain scalar if the store is scalarized.
5169   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
5170     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
5171     assert(WideningDecision != CM_Unknown &&
5172            "Widening decision should be ready at this moment");
5173     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
5174       if (Ptr == Store->getValueOperand())
5175         return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
5178     return WideningDecision != CM_GatherScatter;
5179   };
5180 
5181   // A helper that returns true if the given value is a bitcast or
5182   // getelementptr instruction contained in the loop.
5183   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
5184     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
5185             isa<GetElementPtrInst>(V)) &&
5186            !TheLoop->isLoopInvariant(V);
5187   };
5188 
5189   auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
5190     if (!isa<PHINode>(Ptr) ||
5191         !Legal->getInductionVars().count(cast<PHINode>(Ptr)))
5192       return false;
5193     auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
5194     if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
5195       return false;
5196     return isScalarUse(MemAccess, Ptr);
5197   };
5198 
  // A helper that evaluates a memory access's use of a pointer. If the
  // pointer is actually the pointer induction of a loop, it is inserted into
  // Worklist. If the use will be a scalar use, and the pointer is only used
  // by memory accesses, we place the pointer in ScalarPtrs. Otherwise, the
  // pointer is placed in PossibleNonScalarPtrs.
5204   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
5205     if (isScalarPtrInduction(MemAccess, Ptr)) {
5206       Worklist.insert(cast<Instruction>(Ptr));
5207       Instruction *Update = cast<Instruction>(
5208           cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
5209       Worklist.insert(Update);
5210       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
5211                         << "\n");
5212       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
5213                         << "\n");
5214       return;
5215     }
5216     // We only care about bitcast and getelementptr instructions contained in
5217     // the loop.
5218     if (!isLoopVaryingBitCastOrGEP(Ptr))
5219       return;
5220 
5221     // If the pointer has already been identified as scalar (e.g., if it was
5222     // also identified as uniform), there's nothing to do.
5223     auto *I = cast<Instruction>(Ptr);
5224     if (Worklist.count(I))
5225       return;
5226 
5227     // If the use of the pointer will be a scalar use, and all users of the
5228     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
5229     // place the pointer in PossibleNonScalarPtrs.
5230     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
5231           return isa<LoadInst>(U) || isa<StoreInst>(U);
5232         }))
5233       ScalarPtrs.insert(I);
5234     else
5235       PossibleNonScalarPtrs.insert(I);
5236   };
5237 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
5242   //
5243   // (1) Add to the worklist all instructions that have been identified as
5244   // uniform-after-vectorization.
5245   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
5246 
5247   // (2) Add to the worklist all bitcast and getelementptr instructions used by
5248   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
5250   // scatter operation. The value operand of a store will remain scalar if the
5251   // store is scalarized.
5252   for (auto *BB : TheLoop->blocks())
5253     for (auto &I : *BB) {
5254       if (auto *Load = dyn_cast<LoadInst>(&I)) {
5255         evaluatePtrUse(Load, Load->getPointerOperand());
5256       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5257         evaluatePtrUse(Store, Store->getPointerOperand());
5258         evaluatePtrUse(Store, Store->getValueOperand());
5259       }
5260     }
5261   for (auto *I : ScalarPtrs)
5262     if (!PossibleNonScalarPtrs.count(I)) {
5263       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
5264       Worklist.insert(I);
5265     }
5266 
5267   // Insert the forced scalars.
5268   // FIXME: Currently widenPHIInstruction() often creates a dead vector
5269   // induction variable when the PHI user is scalarized.
5270   auto ForcedScalar = ForcedScalars.find(VF);
5271   if (ForcedScalar != ForcedScalars.end())
5272     for (auto *I : ForcedScalar->second)
5273       Worklist.insert(I);
5274 
5275   // Expand the worklist by looking through any bitcasts and getelementptr
5276   // instructions we've already identified as scalar. This is similar to the
5277   // expansion step in collectLoopUniforms(); however, here we're only
5278   // expanding to include additional bitcasts and getelementptr instructions.
5279   unsigned Idx = 0;
5280   while (Idx != Worklist.size()) {
5281     Instruction *Dst = Worklist[Idx++];
5282     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
5283       continue;
5284     auto *Src = cast<Instruction>(Dst->getOperand(0));
5285     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
5286           auto *J = cast<Instruction>(U);
5287           return !TheLoop->contains(J) || Worklist.count(J) ||
5288                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
5289                   isScalarUse(J, Src));
5290         })) {
5291       Worklist.insert(Src);
5292       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
5293     }
5294   }
5295 
5296   // An induction variable will remain scalar if all users of the induction
5297   // variable and induction variable update remain scalar.
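  // For example, if an induction variable i is only used to form the addresses
  // of widened consecutive accesses (whose GEPs are scalar after vectorization
  // and thus already in Worklist), both i and its update remain scalar.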
5298   for (auto &Induction : Legal->getInductionVars()) {
5299     auto *Ind = Induction.first;
5300     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5301 
5302     // If tail-folding is applied, the primary induction variable will be used
5303     // to feed a vector compare.
5304     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
5305       continue;
5306 
5307     // Determine if all users of the induction variable are scalar after
5308     // vectorization.
5309     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5310       auto *I = cast<Instruction>(U);
5311       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
5312     });
5313     if (!ScalarInd)
5314       continue;
5315 
5316     // Determine if all users of the induction variable update instruction are
5317     // scalar after vectorization.
5318     auto ScalarIndUpdate =
5319         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5320           auto *I = cast<Instruction>(U);
5321           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
5322         });
5323     if (!ScalarIndUpdate)
5324       continue;
5325 
5326     // The induction variable and its update instruction will remain scalar.
5327     Worklist.insert(Ind);
5328     Worklist.insert(IndUpdate);
5329     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
5330     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
5331                       << "\n");
5332   }
5333 
5334   Scalars[VF].insert(Worklist.begin(), Worklist.end());
5335 }
5336 
5337 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I) const {
5338   if (!blockNeedsPredication(I->getParent()))
5339     return false;
5340   switch(I->getOpcode()) {
5341   default:
5342     break;
5343   case Instruction::Load:
5344   case Instruction::Store: {
5345     if (!Legal->isMaskRequired(I))
5346       return false;
5347     auto *Ptr = getLoadStorePointerOperand(I);
5348     auto *Ty = getLoadStoreType(I);
5349     const Align Alignment = getLoadStoreAlignment(I);
5350     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
5351                                 TTI.isLegalMaskedGather(Ty, Alignment))
5352                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
5353                                 TTI.isLegalMaskedScatter(Ty, Alignment));
5354   }
5355   case Instruction::UDiv:
5356   case Instruction::SDiv:
5357   case Instruction::SRem:
5358   case Instruction::URem:
5359     return mayDivideByZero(*I);
5360   }
5361   return false;
5362 }
5363 
5364 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
5365     Instruction *I, ElementCount VF) {
5366   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
5367   assert(getWideningDecision(I, VF) == CM_Unknown &&
5368          "Decision should not be set yet.");
5369   auto *Group = getInterleavedAccessGroup(I);
5370   assert(Group && "Must have a group.");
5371 
  // If the instruction's allocated size doesn't equal its type size, it
5373   // requires padding and will be scalarized.
5374   auto &DL = I->getModule()->getDataLayout();
5375   auto *ScalarTy = getLoadStoreType(I);
5376   if (hasIrregularType(ScalarTy, DL))
5377     return false;
5378 
5379   // Check if masking is required.
5380   // A Group may need masking for one of two reasons: it resides in a block that
5381   // needs predication, or it was decided to use masking to deal with gaps.
5382   bool PredicatedAccessRequiresMasking =
5383       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
5384   bool AccessWithGapsRequiresMasking =
5385       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5386   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
5387     return true;
5388 
5389   // If masked interleaving is required, we expect that the user/target had
5390   // enabled it, because otherwise it either wouldn't have been created or
5391   // it should have been invalidated by the CostModel.
5392   assert(useMaskedInterleavedAccesses(TTI) &&
5393          "Masked interleave-groups for predicated accesses are not enabled.");
5394 
5395   auto *Ty = getLoadStoreType(I);
5396   const Align Alignment = getLoadStoreAlignment(I);
5397   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
5398                           : TTI.isLegalMaskedStore(Ty, Alignment);
5399 }
5400 
5401 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
5402     Instruction *I, ElementCount VF) {
5403   // Get and ensure we have a valid memory instruction.
5404   LoadInst *LI = dyn_cast<LoadInst>(I);
5405   StoreInst *SI = dyn_cast<StoreInst>(I);
5406   assert((LI || SI) && "Invalid memory instruction");
5407 
5408   auto *Ptr = getLoadStorePointerOperand(I);
5409 
  // In order to be widened, the pointer must first be consecutive.
5411   if (!Legal->isConsecutivePtr(Ptr))
5412     return false;
5413 
5414   // If the instruction is a store located in a predicated block, it will be
5415   // scalarized.
5416   if (isScalarWithPredication(I))
5417     return false;
5418 
  // If the instruction's allocated size doesn't equal its type size, it
5420   // requires padding and will be scalarized.
5421   auto &DL = I->getModule()->getDataLayout();
5422   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
5423   if (hasIrregularType(ScalarTy, DL))
5424     return false;
5425 
5426   return true;
5427 }
5428 
5429 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
5430   // We should not collect Uniforms more than once per VF. Right now,
5431   // this function is called from collectUniformsAndScalars(), which
5432   // already does this check. Collecting Uniforms for VF=1 does not make any
5433   // sense.
5434 
5435   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
5436          "This function should not be visited twice for the same VF");
5437 
  // Initialize the entry for this VF so we do not analyze it again; even if
  // no uniform value is found, Uniforms.count(VF) will return 1.
5440   Uniforms[VF].clear();
5441 
5442   // We now know that the loop is vectorizable!
5443   // Collect instructions inside the loop that will remain uniform after
5444   // vectorization.
5445 
  // Global values, params and instructions outside of the current loop are
  // out of scope.
5448   auto isOutOfScope = [&](Value *V) -> bool {
5449     Instruction *I = dyn_cast<Instruction>(V);
5450     return (!I || !TheLoop->contains(I));
5451   };
5452 
5453   SetVector<Instruction *> Worklist;
5454   BasicBlock *Latch = TheLoop->getLoopLatch();
5455 
5456   // Instructions that are scalar with predication must not be considered
5457   // uniform after vectorization, because that would create an erroneous
5458   // replicating region where only a single instance out of VF should be formed.
5459   // TODO: optimize such seldom cases if found important, see PR40816.
5460   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
5461     if (isOutOfScope(I)) {
5462       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
5463                         << *I << "\n");
5464       return;
5465     }
5466     if (isScalarWithPredication(I)) {
5467       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
5468                         << *I << "\n");
5469       return;
5470     }
5471     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
5472     Worklist.insert(I);
5473   };
5474 
5475   // Start with the conditional branch. If the branch condition is an
5476   // instruction contained in the loop that is only used by the branch, it is
5477   // uniform.
5478   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5479   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5480     addToWorklistIfAllowed(Cmp);
5481 
5482   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5483     InstWidening WideningDecision = getWideningDecision(I, VF);
5484     assert(WideningDecision != CM_Unknown &&
5485            "Widening decision should be ready at this moment");
5486 
5487     // A uniform memory op is itself uniform.  We exclude uniform stores
5488     // here as they demand the last lane, not the first one.
5489     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5490       assert(WideningDecision == CM_Scalarize);
5491       return true;
5492     }
5493 
5494     return (WideningDecision == CM_Widen ||
5495             WideningDecision == CM_Widen_Reverse ||
5496             WideningDecision == CM_Interleave);
5497   };
5498 
5499 
5500   // Returns true if Ptr is the pointer operand of a memory access instruction
5501   // I, and I is known to not require scalarization.
5502   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5503     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5504   };
5505 
5506   // Holds a list of values which are known to have at least one uniform use.
5507   // Note that there may be other uses which aren't uniform.  A "uniform use"
5508   // here is something which only demands lane 0 of the unrolled iterations;
5509   // it does not imply that all lanes produce the same value (e.g. this is not
5510   // the usual meaning of uniform)
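  // For example, the pointer operand of a widened consecutive load has a
  // uniform use: only lane 0 of each unrolled part is demanded, the remaining
  // lanes being implied by the consecutive access pattern.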
5511   SetVector<Value *> HasUniformUse;
5512 
5513   // Scan the loop for instructions which are either a) known to have only
5514   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5515   for (auto *BB : TheLoop->blocks())
5516     for (auto &I : *BB) {
5517       // If there's no pointer operand, there's nothing to do.
5518       auto *Ptr = getLoadStorePointerOperand(&I);
5519       if (!Ptr)
5520         continue;
5521 
5522       // A uniform memory op is itself uniform.  We exclude uniform stores
5523       // here as they demand the last lane, not the first one.
5524       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5525         addToWorklistIfAllowed(&I);
5526 
5527       if (isUniformDecision(&I, VF)) {
5528         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5529         HasUniformUse.insert(Ptr);
5530       }
5531     }
5532 
5533   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5534   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5535   // disallows uses outside the loop as well.
5536   for (auto *V : HasUniformUse) {
5537     if (isOutOfScope(V))
5538       continue;
5539     auto *I = cast<Instruction>(V);
5540     auto UsersAreMemAccesses =
5541       llvm::all_of(I->users(), [&](User *U) -> bool {
5542         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5543       });
5544     if (UsersAreMemAccesses)
5545       addToWorklistIfAllowed(I);
5546   }
5547 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
5551   unsigned idx = 0;
5552   while (idx != Worklist.size()) {
5553     Instruction *I = Worklist[idx++];
5554 
5555     for (auto OV : I->operand_values()) {
5556       // isOutOfScope operands cannot be uniform instructions.
5557       if (isOutOfScope(OV))
5558         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
5561       auto *OP = dyn_cast<PHINode>(OV);
5562       if (OP && Legal->isFirstOrderRecurrence(OP))
5563         continue;
5564       // If all the users of the operand are uniform, then add the
5565       // operand into the uniform worklist.
5566       auto *OI = cast<Instruction>(OV);
5567       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5568             auto *J = cast<Instruction>(U);
5569             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5570           }))
5571         addToWorklistIfAllowed(OI);
5572     }
5573   }
5574 
5575   // For an instruction to be added into Worklist above, all its users inside
5576   // the loop should also be in Worklist. However, this condition cannot be
5577   // true for phi nodes that form a cyclic dependence. We must process phi
5578   // nodes separately. An induction variable will remain uniform if all users
5579   // of the induction variable and induction variable update remain uniform.
5580   // The code below handles both pointer and non-pointer induction variables.
5581   for (auto &Induction : Legal->getInductionVars()) {
5582     auto *Ind = Induction.first;
5583     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5584 
5585     // Determine if all users of the induction variable are uniform after
5586     // vectorization.
5587     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5588       auto *I = cast<Instruction>(U);
5589       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5590              isVectorizedMemAccessUse(I, Ind);
5591     });
5592     if (!UniformInd)
5593       continue;
5594 
5595     // Determine if all users of the induction variable update instruction are
5596     // uniform after vectorization.
5597     auto UniformIndUpdate =
5598         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5599           auto *I = cast<Instruction>(U);
5600           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5601                  isVectorizedMemAccessUse(I, IndUpdate);
5602         });
5603     if (!UniformIndUpdate)
5604       continue;
5605 
5606     // The induction variable and its update instruction will remain uniform.
5607     addToWorklistIfAllowed(Ind);
5608     addToWorklistIfAllowed(IndUpdate);
5609   }
5610 
5611   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5612 }
5613 
5614 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5615   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5616 
5617   if (Legal->getRuntimePointerChecking()->Need) {
5618     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5619         "runtime pointer checks needed. Enable vectorization of this "
5620         "loop with '#pragma clang loop vectorize(enable)' when "
5621         "compiling with -Os/-Oz",
5622         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5623     return true;
5624   }
5625 
5626   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5627     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5628         "runtime SCEV checks needed. Enable vectorization of this "
5629         "loop with '#pragma clang loop vectorize(enable)' when "
5630         "compiling with -Os/-Oz",
5631         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5632     return true;
5633   }
5634 
5635   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5636   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5637     reportVectorizationFailure("Runtime stride check for small trip count",
5638         "runtime stride == 1 checks needed. Enable vectorization of "
5639         "this loop without such check by compiling with -Os/-Oz",
5640         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5641     return true;
5642   }
5643 
5644   return false;
5645 }
5646 
5647 ElementCount
5648 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
5649   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
5650     reportVectorizationInfo(
5651         "Disabling scalable vectorization, because target does not "
5652         "support scalable vectors.",
5653         "ScalableVectorsUnsupported", ORE, TheLoop);
5654     return ElementCount::getScalable(0);
5655   }
5656 
5657   if (Hints->isScalableVectorizationDisabled()) {
5658     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
5659                             "ScalableVectorizationDisabled", ORE, TheLoop);
5660     return ElementCount::getScalable(0);
5661   }
5662 
5663   auto MaxScalableVF = ElementCount::getScalable(
5664       std::numeric_limits<ElementCount::ScalarTy>::max());
5665 
5666   // Disable scalable vectorization if the loop contains unsupported reductions.
5667   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
5668   // FIXME: While for scalable vectors this is currently sufficient, this should
5669   // be replaced by a more detailed mechanism that filters out specific VFs,
5670   // instead of invalidating vectorization for a whole set of VFs based on the
5671   // MaxVF.
5672   if (!canVectorizeReductions(MaxScalableVF)) {
5673     reportVectorizationInfo(
5674         "Scalable vectorization not supported for the reduction "
5675         "operations found in this loop.",
5676         "ScalableVFUnfeasible", ORE, TheLoop);
5677     return ElementCount::getScalable(0);
5678   }
5679 
5680   if (Legal->isSafeForAnyVectorWidth())
5681     return MaxScalableVF;
5682 
5683   // Limit MaxScalableVF by the maximum safe dependence distance.
5684   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5685   MaxScalableVF = ElementCount::getScalable(
5686       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5687   if (!MaxScalableVF)
5688     reportVectorizationInfo(
5689         "Max legal vector width too small, scalable vectorization "
5690         "unfeasible.",
5691         "ScalableVFUnfeasible", ORE, TheLoop);
5692 
5693   return MaxScalableVF;
5694 }
5695 
5696 FixedScalableVFPair
5697 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
5698                                                  ElementCount UserVF) {
5699   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5700   unsigned SmallestType, WidestType;
5701   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5702 
5703   // Get the maximum safe dependence distance in bits computed by LAA.
5704   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5705   // the memory accesses that is most restrictive (involved in the smallest
5706   // dependence distance).
5707   unsigned MaxSafeElements =
5708       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
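  // For example, a maximum safe vector width of 512 bits with a widest loop
  // type of 32 bits gives MaxSafeElements = PowerOf2Floor(512 / 32) = 16.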
5709 
5710   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
5711   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
5712 
5713   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5714                     << ".\n");
5715   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5716                     << ".\n");
5717 
5718   // First analyze the UserVF, fall back if the UserVF should be ignored.
5719   if (UserVF) {
5720     auto MaxSafeUserVF =
5721         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
5722 
5723     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF))
5724       return UserVF;
5725 
5726     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5727 
5728     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
5729     // is better to ignore the hint and let the compiler choose a suitable VF.
5730     if (!UserVF.isScalable()) {
5731       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5732                         << " is unsafe, clamping to max safe VF="
5733                         << MaxSafeFixedVF << ".\n");
5734       ORE->emit([&]() {
5735         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5736                                           TheLoop->getStartLoc(),
5737                                           TheLoop->getHeader())
5738                << "User-specified vectorization factor "
5739                << ore::NV("UserVectorizationFactor", UserVF)
5740                << " is unsafe, clamping to maximum safe vectorization factor "
5741                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
5742       });
5743       return MaxSafeFixedVF;
5744     }
5745 
5746     LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5747                       << " is unsafe. Ignoring scalable UserVF.\n");
5748     ORE->emit([&]() {
5749       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5750                                         TheLoop->getStartLoc(),
5751                                         TheLoop->getHeader())
5752              << "User-specified vectorization factor "
5753              << ore::NV("UserVectorizationFactor", UserVF)
5754              << " is unsafe. Ignoring the hint to let the compiler pick a "
5755                 "suitable VF.";
5756     });
5757   }
5758 
5759   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5760                     << " / " << WidestType << " bits.\n");
5761 
5762   FixedScalableVFPair Result(ElementCount::getFixed(1),
5763                              ElementCount::getScalable(0));
5764   if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
5765                                            WidestType, MaxSafeFixedVF))
5766     Result.FixedVF = MaxVF;
5767 
5768   if (auto MaxVF = getMaximizedVFForTarget(ConstTripCount, SmallestType,
5769                                            WidestType, MaxSafeScalableVF))
5770     if (MaxVF.isScalable()) {
5771       Result.ScalableVF = MaxVF;
5772       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5773                         << "\n");
5774     }
5775 
5776   return Result;
5777 }
5778 
5779 FixedScalableVFPair
5780 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5781   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the check is still
    // likely to be dynamically uniform if the target can skip it.
5784     reportVectorizationFailure(
5785         "Not inserting runtime ptr check for divergent target",
5786         "runtime pointer checks needed. Not enabled for divergent target",
5787         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5788     return FixedScalableVFPair::getNone();
5789   }
5790 
5791   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5792   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5793   if (TC == 1) {
5794     reportVectorizationFailure("Single iteration (non) loop",
5795         "loop trip count is one, irrelevant for vectorization",
5796         "SingleIterationLoop", ORE, TheLoop);
5797     return FixedScalableVFPair::getNone();
5798   }
5799 
5800   switch (ScalarEpilogueStatus) {
5801   case CM_ScalarEpilogueAllowed:
5802     return computeFeasibleMaxVF(TC, UserVF);
5803   case CM_ScalarEpilogueNotAllowedUsePredicate:
5804     LLVM_FALLTHROUGH;
5805   case CM_ScalarEpilogueNotNeededUsePredicate:
5806     LLVM_DEBUG(
5807         dbgs() << "LV: vector predicate hint/switch found.\n"
5808                << "LV: Not allowing scalar epilogue, creating predicated "
5809                << "vector loop.\n");
5810     break;
5811   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5812     // fallthrough as a special case of OptForSize
5813   case CM_ScalarEpilogueNotAllowedOptSize:
5814     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5815       LLVM_DEBUG(
5816           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5817     else
5818       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5819                         << "count.\n");
5820 
5821     // Bail if runtime checks are required, which are not good when optimising
5822     // for size.
5823     if (runtimeChecksRequired())
5824       return FixedScalableVFPair::getNone();
5825 
5826     break;
5827   }
5828 
  // The only loops we can vectorize without a scalar epilogue are loops with
5830   // a bottom-test and a single exiting block. We'd have to handle the fact
5831   // that not every instruction executes on the last iteration.  This will
5832   // require a lane mask which varies through the vector loop body.  (TODO)
5833   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5834     // If there was a tail-folding hint/switch, but we can't fold the tail by
5835     // masking, fallback to a vectorization with a scalar epilogue.
5836     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5837       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5838                            "scalar epilogue instead.\n");
5839       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5840       return computeFeasibleMaxVF(TC, UserVF);
5841     }
5842     return FixedScalableVFPair::getNone();
5843   }
5844 
  // Now try tail folding.
5846 
5847   // Invalidate interleave groups that require an epilogue if we can't mask
5848   // the interleave-group.
5849   if (!useMaskedInterleavedAccesses(TTI)) {
5850     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5851            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5854     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5855   }
5856 
5857   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF);
5858   // Avoid tail folding if the trip count is known to be a multiple of any VF
5859   // we chose.
5860   // FIXME: The condition below pessimises the case for fixed-width vectors,
5861   // when scalable VFs are also candidates for vectorization.
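  // For example, with MaxFixedVF = 8 and UserIC = 2, tail folding can only be
  // skipped if the trip count is provably a multiple of 16 (possibly via loop
  // guards), in which case Rem below evaluates to zero.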
5862   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5863     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5864     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5865            "MaxFixedVF must be a power of 2");
5866     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5867                                    : MaxFixedVF.getFixedValue();
5868     ScalarEvolution *SE = PSE.getSE();
5869     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5870     const SCEV *ExitCount = SE->getAddExpr(
5871         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5872     const SCEV *Rem = SE->getURemExpr(
5873         SE->applyLoopGuards(ExitCount, TheLoop),
5874         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5875     if (Rem->isZero()) {
5876       // Accept MaxFixedVF if we do not have a tail.
5877       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5878       return MaxFactors;
5879     }
5880   }
5881 
5882   // If we don't know the precise trip count, or if the trip count that we
5883   // found modulo the vectorization factor is not zero, try to fold the tail
5884   // by masking.
5885   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5886   if (Legal->prepareToFoldTailByMasking()) {
5887     FoldTailByMasking = true;
5888     return MaxFactors;
5889   }
5890 
5891   // If there was a tail-folding hint/switch, but we can't fold the tail by
5892   // masking, fallback to a vectorization with a scalar epilogue.
5893   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5894     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5895                          "scalar epilogue instead.\n");
5896     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5897     return MaxFactors;
5898   }
5899 
5900   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5901     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5902     return FixedScalableVFPair::getNone();
5903   }
5904 
5905   if (TC == 0) {
5906     reportVectorizationFailure(
5907         "Unable to calculate the loop count due to complex control flow",
5908         "unable to calculate the loop count due to complex control flow",
5909         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5910     return FixedScalableVFPair::getNone();
5911   }
5912 
5913   reportVectorizationFailure(
5914       "Cannot optimize for size and vectorize at the same time.",
5915       "cannot optimize for size and vectorize at the same time. "
5916       "Enable vectorization of this loop with '#pragma clang loop "
5917       "vectorize(enable)' when compiling with -Os/-Oz",
5918       "NoTailLoopWithOptForSize", ORE, TheLoop);
5919   return FixedScalableVFPair::getNone();
5920 }
5921 
5922 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5923     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5924     const ElementCount &MaxSafeVF) {
5925   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5926   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5927       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5928                            : TargetTransformInfo::RGK_FixedWidthVector);
5929 
5930   // Convenience function to return the minimum of two ElementCounts.
5931   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5932     assert((LHS.isScalable() == RHS.isScalable()) &&
5933            "Scalable flags must match");
5934     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5935   };
5936 
5937   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
5939   auto MaxVectorElementCount = ElementCount::get(
5940       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5941       ComputeScalableMaxVF);
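  // For example, with 256-bit wide registers and a widest loop type of 64
  // bits, this gives a MaxVectorElementCount of 4 (or vscale x 4 when
  // computing the scalable maximum VF).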
5942   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5943   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5944                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5945 
5946   if (!MaxVectorElementCount) {
5947     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5948     return ElementCount::getFixed(1);
5949   }
5950 
5951   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5952   if (ConstTripCount &&
5953       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5954       isPowerOf2_32(ConstTripCount)) {
5955     // We need to clamp the VF to be the ConstTripCount. There is no point in
5956     // choosing a higher viable VF as done in the loop below. If
5957     // MaxVectorElementCount is scalable, we only fall back on a fixed VF when
5958     // the TC is less than or equal to the known number of lanes.
5959     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5960                       << ConstTripCount << "\n");
5961     return TripCountEC;
5962   }
5963 
5964   ElementCount MaxVF = MaxVectorElementCount;
5965   if (TTI.shouldMaximizeVectorBandwidth() ||
5966       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5967     auto MaxVectorElementCountMaxBW = ElementCount::get(
5968         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5969         ComputeScalableMaxVF);
5970     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5971 
5972     // Collect all viable vectorization factors larger than the default MaxVF
5973     // (i.e. MaxVectorElementCount).
5974     SmallVector<ElementCount, 8> VFs;
5975     for (ElementCount VS = MaxVectorElementCount * 2;
5976          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5977       VFs.push_back(VS);
5978 
5979     // For each VF calculate its register usage.
5980     auto RUs = calculateRegisterUsage(VFs);
5981 
5982     // Select the largest VF which doesn't require more registers than existing
5983     // ones.
5984     for (int i = RUs.size() - 1; i >= 0; --i) {
5985       bool Selected = true;
5986       for (auto &pair : RUs[i].MaxLocalUsers) {
5987         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5988         if (pair.second > TargetNumRegisters)
5989           Selected = false;
5990       }
5991       if (Selected) {
5992         MaxVF = VFs[i];
5993         break;
5994       }
5995     }
5996     if (ElementCount MinVF =
5997             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5998       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5999         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
6000                           << ") with target's minimum: " << MinVF << '\n');
6001         MaxVF = MinVF;
6002       }
6003     }
6004   }
6005   return MaxVF;
6006 }
6007 
6008 bool LoopVectorizationCostModel::isMoreProfitable(
6009     const VectorizationFactor &A, const VectorizationFactor &B) const {
6010   InstructionCost::CostType CostA = *A.Cost.getValue();
6011   InstructionCost::CostType CostB = *B.Cost.getValue();
6012 
6013   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
6014 
6015   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
6016       MaxTripCount) {
    // If we are folding the tail and the trip count is a known (possibly small)
    // constant, the trip count will be rounded up to a whole number of vector
    // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
    // which we compare directly. When not folding the tail, the total cost will
    // be PerIterationCost*floor(TC/VF) + scalar remainder cost, and so is
    // approximated with the per-lane cost below instead of using the trip
    // count as here.
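    // Illustrative example (costs assumed, not from any real target): with
    // MaxTripCount = 10, a VF=4 plan costing 28 per iteration totals
    // 28 * ceil(10/4) = 84, while a VF=8 plan costing 50 per iteration totals
    // 50 * ceil(10/8) = 100, so the VF=4 plan wins here even though its
    // per-lane cost (7) is higher than VF=8's (6.25).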
6024     int64_t RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
6025     int64_t RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
6026     return RTCostA < RTCostB;
6027   }
6028 
  // When scalable vectorization is preferred (via hints), assume for now that
  // vscale may be larger than 1, so that scalable vectorization is slightly
  // favored over fixed-width vectorization.
6032   if (Hints->isScalableVectorizationPreferred())
6033     if (A.Width.isScalable() && !B.Width.isScalable())
6034       return (CostA * B.Width.getKnownMinValue()) <=
6035              (CostB * A.Width.getKnownMinValue());
6036 
6037   // To avoid the need for FP division:
6038   //      (CostA / A.Width) < (CostB / B.Width)
6039   // <=>  (CostA * B.Width) < (CostB * A.Width)
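  // For example (illustrative numbers): CostA = 8 at width 4 vs CostB = 6 at
  // width 2 gives 8 * 2 = 16 < 6 * 4 = 24, i.e. 2 per lane beats 3 per lane,
  // so A is considered more profitable.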
6040   return (CostA * B.Width.getKnownMinValue()) <
6041          (CostB * A.Width.getKnownMinValue());
6042 }
6043 
6044 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
6045     const ElementCountSet &VFCandidates) {
6046   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
6047   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
6048   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
6049   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
6050          "Expected Scalar VF to be a candidate");
6051 
6052   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
6053   VectorizationFactor ChosenFactor = ScalarCost;
6054 
6055   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
6056   if (ForceVectorization && VFCandidates.size() > 1) {
6057     // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that at least VF = 2 is chosen during cost
    // evaluation.
6060     ChosenFactor.Cost = std::numeric_limits<InstructionCost::CostType>::max();
6061   }
6062 
6063   for (const auto &i : VFCandidates) {
6064     // The cost for scalar VF=1 is already calculated, so ignore it.
6065     if (i.isScalar())
6066       continue;
6067 
    // Notice that the vector loop needs to be executed fewer times, so
    // we need to divide the cost of the vector loop by the width of
    // the vector elements.
6071     VectorizationCostTy C = expectedCost(i);
6072 
6073     assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
6074     VectorizationFactor Candidate(i, C.first);
6075     LLVM_DEBUG(
6076         dbgs() << "LV: Vector loop of width " << i << " costs: "
6077                << (*Candidate.Cost.getValue() /
6078                    Candidate.Width.getKnownMinValue())
6079                << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "")
6080                << ".\n");
6081 
6082     if (!C.second && !ForceVectorization) {
6083       LLVM_DEBUG(
6084           dbgs() << "LV: Not considering vector loop of width " << i
6085                  << " because it will not generate any vector instructions.\n");
6086       continue;
6087     }
6088 
    // If profitable, add it to the ProfitableVFs list.
6090     if (isMoreProfitable(Candidate, ScalarCost))
6091       ProfitableVFs.push_back(Candidate);
6092 
6093     if (isMoreProfitable(Candidate, ChosenFactor))
6094       ChosenFactor = Candidate;
6095   }
6096 
6097   if (!EnableCondStoresVectorization && NumPredStores) {
6098     reportVectorizationFailure("There are conditional stores.",
6099         "store that is conditionally executed prevents vectorization",
6100         "ConditionalStore", ORE, TheLoop);
6101     ChosenFactor = ScalarCost;
6102   }
6103 
6104   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
6105                  *ChosenFactor.Cost.getValue() >= *ScalarCost.Cost.getValue())
6106                  dbgs()
6107              << "LV: Vectorization seems to be not beneficial, "
6108              << "but was forced by a user.\n");
6109   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
6110   return ChosenFactor;
6111 }
6112 
6113 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
6114     const Loop &L, ElementCount VF) const {
  // Cross-iteration phis such as reductions need special handling and are
6116   // currently unsupported.
6117   if (any_of(L.getHeader()->phis(), [&](PHINode &Phi) {
6118         return Legal->isFirstOrderRecurrence(&Phi) ||
6119                Legal->isReductionVariable(&Phi);
6120       }))
6121     return false;
6122 
6123   // Phis with uses outside of the loop require special handling and are
6124   // currently unsupported.
6125   for (auto &Entry : Legal->getInductionVars()) {
6126     // Look for uses of the value of the induction at the last iteration.
6127     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
6128     for (User *U : PostInc->users())
6129       if (!L.contains(cast<Instruction>(U)))
6130         return false;
    // Look for uses of the penultimate value of the induction.
6132     for (User *U : Entry.first->users())
6133       if (!L.contains(cast<Instruction>(U)))
6134         return false;
6135   }
6136 
6137   // Induction variables that are widened require special handling that is
6138   // currently not supported.
6139   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
6140         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
6141                  this->isProfitableToScalarize(Entry.first, VF));
6142       }))
6143     return false;
6144 
6145   return true;
6146 }
6147 
6148 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
6149     const ElementCount VF) const {
6150   // FIXME: We need a much better cost-model to take different parameters such
6151   // as register pressure, code size increase and cost of extra branches into
6152   // account. For now we apply a very crude heuristic and only consider loops
6153   // with vectorization factors larger than a certain value.
6154   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
6156   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
6157     return false;
6158   if (VF.getFixedValue() >= EpilogueVectorizationMinVF)
6159     return true;
6160   return false;
6161 }
6162 
6163 VectorizationFactor
6164 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
6165     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
6166   VectorizationFactor Result = VectorizationFactor::Disabled();
6167   if (!EnableEpilogueVectorization) {
6168     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
6169     return Result;
6170   }
6171 
6172   if (!isScalarEpilogueAllowed()) {
6173     LLVM_DEBUG(
6174         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
6175                   "allowed.\n";);
6176     return Result;
6177   }
6178 
6179   // FIXME: This can be fixed for scalable vectors later, because at this stage
6180   // the LoopVectorizer will only consider vectorizing a loop with scalable
6181   // vectors when the loop has a hint to enable vectorization for a given VF.
6182   if (MainLoopVF.isScalable()) {
6183     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization for scalable vectors not "
6184                          "yet supported.\n");
6185     return Result;
6186   }
6187 
6188   // Not really a cost consideration, but check for unsupported cases here to
6189   // simplify the logic.
6190   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
6191     LLVM_DEBUG(
6192         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
6193                   "not a supported candidate.\n";);
6194     return Result;
6195   }
6196 
6197   if (EpilogueVectorizationForceVF > 1) {
6198     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
6199     if (LVP.hasPlanWithVFs(
6200             {MainLoopVF, ElementCount::getFixed(EpilogueVectorizationForceVF)}))
6201       return {ElementCount::getFixed(EpilogueVectorizationForceVF), 0};
6202     else {
6203       LLVM_DEBUG(
6204           dbgs()
6205               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
6206       return Result;
6207     }
6208   }
6209 
6210   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
6211       TheLoop->getHeader()->getParent()->hasMinSize()) {
6212     LLVM_DEBUG(
6213         dbgs()
6214             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
6215     return Result;
6216   }
6217 
6218   if (!isEpilogueVectorizationProfitable(MainLoopVF))
6219     return Result;
6220 
6221   for (auto &NextVF : ProfitableVFs)
6222     if (ElementCount::isKnownLT(NextVF.Width, MainLoopVF) &&
6223         (Result.Width.getFixedValue() == 1 ||
6224          isMoreProfitable(NextVF, Result)) &&
6225         LVP.hasPlanWithVFs({MainLoopVF, NextVF.Width}))
6226       Result = NextVF;
6227 
6228   if (Result != VectorizationFactor::Disabled())
6229     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
6230                       << Result.Width.getFixedValue() << "\n";);
6231   return Result;
6232 }
6233 
6234 std::pair<unsigned, unsigned>
6235 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
6236   unsigned MinWidth = -1U;
6237   unsigned MaxWidth = 8;
6238   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
6239 
6240   // For each block.
6241   for (BasicBlock *BB : TheLoop->blocks()) {
6242     // For each instruction in the loop.
6243     for (Instruction &I : BB->instructionsWithoutDebug()) {
6244       Type *T = I.getType();
6245 
6246       // Skip ignored values.
6247       if (ValuesToIgnore.count(&I))
6248         continue;
6249 
6250       // Only examine Loads, Stores and PHINodes.
6251       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
6252         continue;
6253 
6254       // Examine PHI nodes that are reduction variables. Update the type to
6255       // account for the recurrence type.
6256       if (auto *PN = dyn_cast<PHINode>(&I)) {
6257         if (!Legal->isReductionVariable(PN))
6258           continue;
6259         RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
6260         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
6261             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
6262                                       RdxDesc.getRecurrenceType(),
6263                                       TargetTransformInfo::ReductionFlags()))
6264           continue;
6265         T = RdxDesc.getRecurrenceType();
6266       }
6267 
6268       // Examine the stored values.
6269       if (auto *ST = dyn_cast<StoreInst>(&I))
6270         T = ST->getValueOperand()->getType();
6271 
6272       // Ignore loaded pointer types and stored pointer types that are not
6273       // vectorizable.
6274       //
6275       // FIXME: The check here attempts to predict whether a load or store will
6276       //        be vectorized. We only know this for certain after a VF has
6277       //        been selected. Here, we assume that if an access can be
6278       //        vectorized, it will be. We should also look at extending this
6279       //        optimization to non-pointer types.
6280       //
6281       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
6282           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
6283         continue;
6284 
6285       MinWidth = std::min(MinWidth,
6286                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6287       MaxWidth = std::max(MaxWidth,
6288                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
6289     }
6290   }
6291 
6292   return {MinWidth, MaxWidth};
6293 }
6294 
6295 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
6296                                                            unsigned LoopCost) {
6297   // -- The interleave heuristics --
6298   // We interleave the loop in order to expose ILP and reduce the loop overhead.
6299   // There are many micro-architectural considerations that we can't predict
6300   // at this level. For example, frontend pressure (on decode or fetch) due to
6301   // code size, or the number and capabilities of the execution ports.
6302   //
6303   // We use the following heuristics to select the interleave count:
6304   // 1. If the code has reductions, then we interleave to break the cross
6305   // iteration dependency.
6306   // 2. If the loop is really small, then we interleave to reduce the loop
6307   // overhead.
6308   // 3. We don't interleave if we think that we will spill registers to memory
6309   // due to the increased register pressure.
6310 
6311   if (!isScalarEpilogueAllowed())
6312     return 1;
6313 
  // If there is a maximum safe dependence distance, it was already used to
  // bound the vectorization factor; do not interleave further, since that
  // would widen the range of elements accessed together beyond it.
  if (Legal->getMaxSafeDepDistBytes() != -1U)
6316     return 1;
6317 
6318   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6319   const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because under those conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
6325   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6326       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6327     return 1;
6328 
6329   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // Since we divide by these counts, assume that we have at least one
  // instruction that uses at least one register.
6332   for (auto& pair : R.MaxLocalUsers) {
6333     pair.second = std::max(pair.second, 1U);
6334   }
6335 
6336   // We calculate the interleave count using the following formula.
6337   // Subtract the number of loop invariants from the number of available
6338   // registers. These registers are used by all of the interleaved instances.
6339   // Next, divide the remaining registers by the number of registers that is
6340   // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when optimizing for size, in which case we
  // return an interleave count of 1 above.
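  // As a hypothetical example: with 32 registers in a class, 2 of them tied up
  // by loop-invariant values and at most 7 values live at once, the estimate
  // is PowerOf2Floor((32 - 2) / 7) = PowerOf2Floor(4) = 4 interleaved copies.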
6347   unsigned IC = UINT_MAX;
6348 
6349   for (auto& pair : R.MaxLocalUsers) {
6350     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
6351     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
6352                       << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
6354     if (VF.isScalar()) {
6355       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6356         TargetNumRegisters = ForceTargetNumScalarRegs;
6357     } else {
6358       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6359         TargetNumRegisters = ForceTargetNumVectorRegs;
6360     }
6361     unsigned MaxLocalUsers = pair.second;
6362     unsigned LoopInvariantRegs = 0;
6363     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6364       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6365 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6367     // Don't count the induction variable as interleaved.
6368     if (EnableIndVarRegisterHeur) {
6369       TmpIC =
6370           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6371                         std::max(1U, (MaxLocalUsers - 1)));
6372     }
6373 
6374     IC = std::min(IC, TmpIC);
6375   }
6376 
6377   // Clamp the interleave ranges to reasonable counts.
6378   unsigned MaxInterleaveCount =
6379       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6380 
6381   // Check if the user has overridden the max.
6382   if (VF.isScalar()) {
6383     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6384       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6385   } else {
6386     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6387       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6388   }
6389 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF, ensuring the
  // result is at least 1.
6393   //
6394   // For scalable vectors we can't know if interleaving is beneficial. It may
6395   // not be beneficial for small loops if none of the lanes in the second vector
  // iteration is enabled. However, for larger loops, there is likely to be a
6397   // similar benefit as for fixed-width vectors. For now, we choose to leave
6398   // the InterleaveCount as if vscale is '1', although if some information about
6399   // the vector is known (e.g. min vector size), we can make a better decision.
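  // For example, an estimated trip count of 17 with VF = 4 limits the
  // interleave count to 17 / 4 = 4 (integer division), keeping VF * IC within
  // the trip count.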
6400   if (BestKnownTC) {
6401     MaxInterleaveCount =
6402         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6403     // Make sure MaxInterleaveCount is greater than 0.
6404     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6405   }
6406 
6407   assert(MaxInterleaveCount > 0 &&
6408          "Maximum interleave count must be greater than 0");
6409 
  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
6412   if (IC > MaxInterleaveCount)
6413     IC = MaxInterleaveCount;
6414   else
6415     // Make sure IC is greater than 0.
6416     IC = std::max(1u, IC);
6417 
6418   assert(IC > 0 && "Interleave count must be greater than 0.");
6419 
6420   // If we did not calculate the cost for VF (because the user selected the VF)
6421   // then we calculate the cost of VF here.
6422   if (LoopCost == 0) {
6423     assert(expectedCost(VF).first.isValid() && "Expected a valid cost");
6424     LoopCost = *expectedCost(VF).first.getValue();
6425   }
6426 
6427   assert(LoopCost && "Non-zero loop cost expected");
6428 
6429   // Interleave if we vectorized this loop and there is a reduction that could
6430   // benefit from interleaving.
6431   if (VF.isVector() && HasReductions) {
6432     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6433     return IC;
6434   }
6435 
6436   // Note that if we've already vectorized the loop we will have done the
6437   // runtime check and so interleaving won't require further checks.
6438   bool InterleavingRequiresRuntimePointerCheck =
6439       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6440 
6441   // We want to interleave small loops in order to reduce the loop overhead and
6442   // potentially expose ILP opportunities.
6443   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6444                     << "LV: IC is " << IC << '\n'
6445                     << "LV: VF is " << VF << '\n');
6446   const bool AggressivelyInterleaveReductions =
6447       TTI.enableAggressiveInterleaving(HasReductions);
6448   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
    // We assume that the loop overhead costs about 1, and we use the cost
    // model's estimate of the loop body to interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
6452     unsigned SmallIC =
6453         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
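    // For instance (illustrative values only), with SmallLoopCost = 20 and a
    // loop body costing 3, SmallIC = min(IC, PowerOf2Floor(20 / 3)) =
    // min(IC, 4).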
6454 
6455     // Interleave until store/load ports (estimated by max interleave count) are
6456     // saturated.
6457     unsigned NumStores = Legal->getNumStores();
6458     unsigned NumLoads = Legal->getNumLoads();
6459     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6460     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
6461 
    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. Limit the interleave count,
    // by default to 2, so the critical path only gets increased by one
    // reduction operation.
6466     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6467       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6468       SmallIC = std::min(SmallIC, F);
6469       StoresIC = std::min(StoresIC, F);
6470       LoadsIC = std::min(LoadsIC, F);
6471     }
6472 
6473     if (EnableLoadStoreRuntimeInterleave &&
6474         std::max(StoresIC, LoadsIC) > SmallIC) {
6475       LLVM_DEBUG(
6476           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6477       return std::max(StoresIC, LoadsIC);
6478     }
6479 
6480     // If there are scalar reductions and TTI has enabled aggressive
6481     // interleaving for reductions, we will interleave to expose ILP.
6482     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6483         AggressivelyInterleaveReductions) {
6484       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6485       // Interleave no less than SmallIC but not as aggressive as the normal IC
6486       // to satisfy the rare situation when resources are too limited.
6487       return std::max(IC / 2, SmallIC);
6488     } else {
6489       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6490       return SmallIC;
6491     }
6492   }
6493 
6494   // Interleave if this is a large loop (small loops are already dealt with by
6495   // this point) that could benefit from interleaving.
6496   if (AggressivelyInterleaveReductions) {
6497     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6498     return IC;
6499   }
6500 
6501   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6502   return 1;
6503 }
6504 
6505 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6506 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest
  // number of values that are alive at a single location. Obviously, this is a
  // very rough estimation. We scan the loop in a topological order and
  // assign a number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi-map that
  // holds the list of intervals that *end* at a specific location. This
  // multi-map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it in
  // a set. If we find this value in the multi-map then we remove it from the
  // set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because when we unroll, loop-invariant values do not take
  // more registers.
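  // As a small example, if the value defined at index 0 is last used at index
  // 3 and the value defined at index 1 is last used at index 4, both intervals
  // are open while scanning index 2, so (assuming neither is ignored and both
  // map to the same register class) the usage estimate there is at least 2.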
6524   LoopBlocksDFS DFS(TheLoop);
6525   DFS.perform(LI);
6526 
6527   RegisterUsage RU;
6528 
6529   // Each 'key' in the map opens a new interval. The values
6530   // of the map are the index of the 'last seen' usage of the
6531   // instruction that is the key.
6532   using IntervalMap = DenseMap<Instruction *, unsigned>;
6533 
6534   // Maps instruction to its index.
6535   SmallVector<Instruction *, 64> IdxToInstr;
6536   // Marks the end of each interval.
6537   IntervalMap EndPoint;
6538   // Saves the list of instruction indices that are used in the loop.
6539   SmallPtrSet<Instruction *, 8> Ends;
6540   // Saves the list of values that are used in the loop but are
6541   // defined outside the loop, such as arguments and constants.
6542   SmallPtrSet<Value *, 8> LoopInvariants;
6543 
6544   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6545     for (Instruction &I : BB->instructionsWithoutDebug()) {
6546       IdxToInstr.push_back(&I);
6547 
6548       // Save the end location of each USE.
6549       for (Value *U : I.operands()) {
6550         auto *Instr = dyn_cast<Instruction>(U);
6551 
6552         // Ignore non-instruction values such as arguments, constants, etc.
6553         if (!Instr)
6554           continue;
6555 
6556         // If this instruction is outside the loop then record it and continue.
6557         if (!TheLoop->contains(Instr)) {
6558           LoopInvariants.insert(Instr);
6559           continue;
6560         }
6561 
6562         // Overwrite previous end points.
6563         EndPoint[Instr] = IdxToInstr.size();
6564         Ends.insert(Instr);
6565       }
6566     }
6567   }
6568 
6569   // Saves the list of intervals that end with the index in 'key'.
6570   using InstrList = SmallVector<Instruction *, 2>;
6571   DenseMap<unsigned, InstrList> TransposeEnds;
6572 
6573   // Transpose the EndPoints to a list of values that end at each index.
6574   for (auto &Interval : EndPoint)
6575     TransposeEnds[Interval.second].push_back(Interval.first);
6576 
6577   SmallPtrSet<Instruction *, 8> OpenIntervals;
6578   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6579   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6580 
6581   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6582 
6583   // A lambda that gets the register usage for the given type and VF.
6584   const auto &TTICapture = TTI;
6585   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
6586     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6587       return 0;
6588     return *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6589   };
6590 
6591   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6592     Instruction *I = IdxToInstr[i];
6593 
6594     // Remove all of the instructions that end at this location.
6595     InstrList &List = TransposeEnds[i];
6596     for (Instruction *ToRemove : List)
6597       OpenIntervals.erase(ToRemove);
6598 
6599     // Ignore instructions that are never used within the loop.
6600     if (!Ends.count(I))
6601       continue;
6602 
6603     // Skip ignored values.
6604     if (ValuesToIgnore.count(I))
6605       continue;
6606 
6607     // For each VF find the maximum usage of registers.
6608     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6609       // Count the number of live intervals.
6610       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6611 
6612       if (VFs[j].isScalar()) {
6613         for (auto Inst : OpenIntervals) {
6614           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6615           if (RegUsage.find(ClassID) == RegUsage.end())
6616             RegUsage[ClassID] = 1;
6617           else
6618             RegUsage[ClassID] += 1;
6619         }
6620       } else {
6621         collectUniformsAndScalars(VFs[j]);
6622         for (auto Inst : OpenIntervals) {
6623           // Skip ignored values for VF > 1.
6624           if (VecValuesToIgnore.count(Inst))
6625             continue;
6626           if (isScalarAfterVectorization(Inst, VFs[j])) {
6627             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
6628             if (RegUsage.find(ClassID) == RegUsage.end())
6629               RegUsage[ClassID] = 1;
6630             else
6631               RegUsage[ClassID] += 1;
6632           } else {
6633             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
6634             if (RegUsage.find(ClassID) == RegUsage.end())
6635               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6636             else
6637               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6638           }
6639         }
6640       }
6641 
6642       for (auto& pair : RegUsage) {
6643         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] =
              std::max(MaxUsages[j][pair.first], pair.second);
6645         else
6646           MaxUsages[j][pair.first] = pair.second;
6647       }
6648     }
6649 
6650     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6651                       << OpenIntervals.size() << '\n');
6652 
6653     // Add the current instruction to the list of open intervals.
6654     OpenIntervals.insert(I);
6655   }
6656 
6657   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6658     SmallMapVector<unsigned, unsigned, 4> Invariant;
6659 
6660     for (auto Inst : LoopInvariants) {
6661       unsigned Usage =
6662           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6663       unsigned ClassID =
6664           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6665       if (Invariant.find(ClassID) == Invariant.end())
6666         Invariant[ClassID] = Usage;
6667       else
6668         Invariant[ClassID] += Usage;
6669     }
6670 
6671     LLVM_DEBUG({
6672       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6673       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6674              << " item\n";
6675       for (const auto &pair : MaxUsages[i]) {
6676         dbgs() << "LV(REG): RegisterClass: "
6677                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6678                << " registers\n";
6679       }
6680       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6681              << " item\n";
6682       for (const auto &pair : Invariant) {
6683         dbgs() << "LV(REG): RegisterClass: "
6684                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6685                << " registers\n";
6686       }
6687     });
6688 
6689     RU.LoopInvariantRegs = Invariant;
6690     RU.MaxLocalUsers = MaxUsages[i];
6691     RUs[i] = RU;
6692   }
6693 
6694   return RUs;
6695 }
6696 
bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
6698   // TODO: Cost model for emulated masked load/store is completely
6699   // broken. This hack guides the cost model to use an artificially
6700   // high enough value to practically disable vectorization with such
6701   // operations, except where previously deployed legality hack allowed
6702   // using very low cost values. This is to avoid regressions coming simply
6703   // from moving "masked load/store" check from legality to cost model.
  // Masked load/gather emulation was previously never allowed.
  // A limited amount of masked store/scatter emulation was allowed.
6706   assert(isPredicatedInst(I) &&
6707          "Expecting a scalar emulated instruction");
6708   return isa<LoadInst>(I) ||
6709          (isa<StoreInst>(I) &&
6710           NumPredStores > NumberOfStoresToPredicate);
6711 }
6712 
6713 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6714   // If we aren't vectorizing the loop, or if we've already collected the
6715   // instructions to scalarize, there's nothing to do. Collection may already
6716   // have occurred if we have a user-selected VF and are now computing the
6717   // expected cost for interleaving.
6718   if (VF.isScalar() || VF.isZero() ||
6719       InstsToScalarize.find(VF) != InstsToScalarize.end())
6720     return;
6721 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6723   // not profitable to scalarize any instructions, the presence of VF in the
6724   // map will indicate that we've analyzed it already.
6725   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6726 
6727   // Find all the instructions that are scalar with predication in the loop and
6728   // determine if it would be better to not if-convert the blocks they are in.
6729   // If so, we also record the instructions to scalarize.
6730   for (BasicBlock *BB : TheLoop->blocks()) {
6731     if (!blockNeedsPredication(BB))
6732       continue;
6733     for (Instruction &I : *BB)
6734       if (isScalarWithPredication(&I)) {
6735         ScalarCostsTy ScalarCosts;
6736         // Do not apply discount logic if hacked cost is needed
6737         // for emulated masked memrefs.
6738         if (!useEmulatedMaskMemRefHack(&I) &&
6739             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6740           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6741         // Remember that BB will remain after vectorization.
6742         PredicatedBBsAfterVectorization.insert(BB);
6743       }
6744   }
6745 }
6746 
6747 int LoopVectorizationCostModel::computePredInstDiscount(
6748     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6749   assert(!isUniformAfterVectorization(PredInst, VF) &&
6750          "Instruction marked uniform-after-vectorization will be predicated");
6751 
6752   // Initialize the discount to zero, meaning that the scalar version and the
6753   // vector version cost the same.
6754   InstructionCost Discount = 0;
6755 
6756   // Holds instructions to analyze. The instructions we visit are mapped in
6757   // ScalarCosts. Those instructions are the ones that would be scalarized if
6758   // we find that the scalar version costs less.
6759   SmallVector<Instruction *, 8> Worklist;
6760 
6761   // Returns true if the given instruction can be scalarized.
6762   auto canBeScalarized = [&](Instruction *I) -> bool {
6763     // We only attempt to scalarize instructions forming a single-use chain
6764     // from the original predicated block that would otherwise be vectorized.
6765     // Although not strictly necessary, we give up on instructions we know will
6766     // already be scalar to avoid traversing chains that are unlikely to be
6767     // beneficial.
6768     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6769         isScalarAfterVectorization(I, VF))
6770       return false;
6771 
6772     // If the instruction is scalar with predication, it will be analyzed
6773     // separately. We ignore it within the context of PredInst.
6774     if (isScalarWithPredication(I))
6775       return false;
6776 
6777     // If any of the instruction's operands are uniform after vectorization,
6778     // the instruction cannot be scalarized. This prevents, for example, a
6779     // masked load from being scalarized.
6780     //
6781     // We assume we will only emit a value for lane zero of an instruction
6782     // marked uniform after vectorization, rather than VF identical values.
6783     // Thus, if we scalarize an instruction that uses a uniform, we would
6784     // create uses of values corresponding to the lanes we aren't emitting code
6785     // for. This behavior can be changed by allowing getScalarValue to clone
6786     // the lane zero values for uniforms rather than asserting.
6787     for (Use &U : I->operands())
6788       if (auto *J = dyn_cast<Instruction>(U.get()))
6789         if (isUniformAfterVectorization(J, VF))
6790           return false;
6791 
6792     // Otherwise, we can scalarize the instruction.
6793     return true;
6794   };
6795 
6796   // Compute the expected cost discount from scalarizing the entire expression
6797   // feeding the predicated instruction. We currently only consider expressions
6798   // that are single-use instruction chains.
6799   Worklist.push_back(PredInst);
6800   while (!Worklist.empty()) {
6801     Instruction *I = Worklist.pop_back_val();
6802 
6803     // If we've already analyzed the instruction, there's nothing to do.
6804     if (ScalarCosts.find(I) != ScalarCosts.end())
6805       continue;
6806 
6807     // Compute the cost of the vector instruction. Note that this cost already
6808     // includes the scalarization overhead of the predicated instruction.
6809     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6810 
6811     // Compute the cost of the scalarized instruction. This cost is the cost of
6812     // the instruction as if it wasn't if-converted and instead remained in the
6813     // predicated block. We will scale this cost by block probability after
6814     // computing the scalarization overhead.
6815     assert(!VF.isScalable() && "scalable vectors not yet supported.");
6816     InstructionCost ScalarCost =
6817         VF.getKnownMinValue() *
6818         getInstructionCost(I, ElementCount::getFixed(1)).first;
6819 
6820     // Compute the scalarization overhead of needed insertelement instructions
6821     // and phi nodes.
6822     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
6823       ScalarCost += TTI.getScalarizationOverhead(
6824           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6825           APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
6826       assert(!VF.isScalable() && "scalable vectors not yet supported.");
6827       ScalarCost +=
6828           VF.getKnownMinValue() *
6829           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6830     }
6831 
6832     // Compute the scalarization overhead of needed extractelement
6833     // instructions. For each of the instruction's operands, if the operand can
6834     // be scalarized, add it to the worklist; otherwise, account for the
6835     // overhead.
6836     for (Use &U : I->operands())
6837       if (auto *J = dyn_cast<Instruction>(U.get())) {
6838         assert(VectorType::isValidElementType(J->getType()) &&
6839                "Instruction has non-scalar type");
6840         if (canBeScalarized(J))
6841           Worklist.push_back(J);
6842         else if (needsExtract(J, VF)) {
6843           assert(!VF.isScalable() && "scalable vectors not yet supported.");
6844           ScalarCost += TTI.getScalarizationOverhead(
6845               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6846               APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
6847         }
6848       }
6849 
6850     // Scale the total scalar cost by block probability.
6851     ScalarCost /= getReciprocalPredBlockProb();
6852 
6853     // Compute the discount. A non-negative discount means the vector version
6854     // of the instruction costs more, and scalarizing would be beneficial.
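    // For example (illustrative costs), VectorCost = 10 and ScalarCost = 6 add
    // +4 to the discount, i.e. scalarizing this part of the chain looks
    // beneficial.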
6855     Discount += VectorCost - ScalarCost;
6856     ScalarCosts[I] = ScalarCost;
6857   }
6858 
6859   return *Discount.getValue();
6860 }
6861 
6862 LoopVectorizationCostModel::VectorizationCostTy
6863 LoopVectorizationCostModel::expectedCost(ElementCount VF) {
6864   VectorizationCostTy Cost;
6865 
6866   // For each block.
6867   for (BasicBlock *BB : TheLoop->blocks()) {
6868     VectorizationCostTy BlockCost;
6869 
6870     // For each instruction in the old loop.
6871     for (Instruction &I : BB->instructionsWithoutDebug()) {
6872       // Skip ignored values.
6873       if (ValuesToIgnore.count(&I) ||
6874           (VF.isVector() && VecValuesToIgnore.count(&I)))
6875         continue;
6876 
6877       VectorizationCostTy C = getInstructionCost(&I, VF);
6878 
6879       // Check if we should override the cost.
6880       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
6881         C.first = InstructionCost(ForceTargetInstructionCost);
6882 
6883       BlockCost.first += C.first;
6884       BlockCost.second |= C.second;
6885       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6886                         << " for VF " << VF << " For instruction: " << I
6887                         << '\n');
6888     }
6889 
6890     // If we are vectorizing a predicated block, it will have been
6891     // if-converted. This means that the block's instructions (aside from
6892     // stores and instructions that may divide by zero) will now be
6893     // unconditionally executed. For the scalar case, we may not always execute
6894     // the predicated block, if it is an if-else block. Thus, scale the block's
6895     // cost by the probability of executing it. blockNeedsPredication from
6896     // Legal is used so as to not include all blocks in tail folded loops.
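    // For example, if getReciprocalPredBlockProb() is 2 (i.e. the predicated
    // block is assumed to run on roughly half of the scalar iterations), a
    // block whose instructions cost 12 contributes only 6 here.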
6897     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6898       BlockCost.first /= getReciprocalPredBlockProb();
6899 
6900     Cost.first += BlockCost.first;
6901     Cost.second |= BlockCost.second;
6902   }
6903 
6904   return Cost;
6905 }
6906 
6907 /// Gets Address Access SCEV after verifying that the access pattern
6908 /// is loop invariant except the induction variable dependence.
6909 ///
6910 /// This SCEV can be sent to the Target in order to estimate the address
6911 /// calculation cost.
6912 static const SCEV *getAddressAccessSCEV(
6913               Value *Ptr,
6914               LoopVectorizationLegality *Legal,
6915               PredicatedScalarEvolution &PSE,
6916               const Loop *TheLoop) {
6917 
6918   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6919   if (!Gep)
6920     return nullptr;
6921 
6922   // We are looking for a gep with all loop invariant indices except for one
6923   // which should be an induction variable.
6924   auto SE = PSE.getSE();
6925   unsigned NumOperands = Gep->getNumOperands();
6926   for (unsigned i = 1; i < NumOperands; ++i) {
6927     Value *Opd = Gep->getOperand(i);
6928     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6929         !Legal->isInductionVariable(Opd))
6930       return nullptr;
6931   }
6932 
6933   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
6934   return PSE.getSCEV(Ptr);
6935 }
6936 
6937 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6938   return Legal->hasStride(I->getOperand(0)) ||
6939          Legal->hasStride(I->getOperand(1));
6940 }
6941 
6942 InstructionCost
6943 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6944                                                         ElementCount VF) {
6945   assert(VF.isVector() &&
6946          "Scalarization cost of instruction implies vectorization.");
6947   if (VF.isScalable())
6948     return InstructionCost::getInvalid();
6949 
6950   Type *ValTy = getLoadStoreType(I);
6951   auto SE = PSE.getSE();
6952 
6953   unsigned AS = getLoadStoreAddressSpace(I);
6954   Value *Ptr = getLoadStorePointerOperand(I);
6955   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6956 
6957   // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6959   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6960 
6961   // Get the cost of the scalar memory instruction and address computation.
6962   InstructionCost Cost =
6963       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6964 
6965   // Don't pass *I here, since it is scalar but will actually be part of a
6966   // vectorized loop where the user of it is a vectorized instruction.
6967   const Align Alignment = getLoadStoreAlignment(I);
6968   Cost += VF.getKnownMinValue() *
6969           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6970                               AS, TTI::TCK_RecipThroughput);
6971 
6972   // Get the overhead of the extractelement and insertelement instructions
6973   // we might create due to scalarization.
6974   Cost += getScalarizationOverhead(I, VF);
6975 
6976   // If we have a predicated load/store, it will need extra i1 extracts and
6977   // conditional branches, but may not be executed for each vector lane. Scale
6978   // the cost by the probability of executing the predicated block.
6979   if (isPredicatedInst(I)) {
6980     Cost /= getReciprocalPredBlockProb();
6981 
6982     // Add the cost of an i1 extract and a branch
6983     auto *Vec_i1Ty =
6984         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6985     Cost += TTI.getScalarizationOverhead(
6986         Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
6987         /*Insert=*/false, /*Extract=*/true);
6988     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6989 
6990     if (useEmulatedMaskMemRefHack(I))
6991       // Artificially setting to a high enough value to practically disable
6992       // vectorization with such operations.
6993       Cost = 3000000;
6994   }
6995 
6996   return Cost;
6997 }
6998 
6999 InstructionCost
7000 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
7001                                                     ElementCount VF) {
7002   Type *ValTy = getLoadStoreType(I);
7003   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7004   Value *Ptr = getLoadStorePointerOperand(I);
7005   unsigned AS = getLoadStoreAddressSpace(I);
7006   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
7007   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7008 
7009   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7010          "Stride should be 1 or -1 for consecutive memory access");
7011   const Align Alignment = getLoadStoreAlignment(I);
7012   InstructionCost Cost = 0;
7013   if (Legal->isMaskRequired(I))
7014     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
7015                                       CostKind);
7016   else
7017     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
7018                                 CostKind, I);
7019 
7020   bool Reverse = ConsecutiveStride < 0;
7021   if (Reverse)
7022     Cost +=
7023         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
7024   return Cost;
7025 }
7026 
7027 InstructionCost
7028 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
7029                                                 ElementCount VF) {
7030   assert(Legal->isUniformMemOp(*I));
7031 
7032   Type *ValTy = getLoadStoreType(I);
7033   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7034   const Align Alignment = getLoadStoreAlignment(I);
7035   unsigned AS = getLoadStoreAddressSpace(I);
7036   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7037   if (isa<LoadInst>(I)) {
7038     return TTI.getAddressComputationCost(ValTy) +
7039            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
7040                                CostKind) +
7041            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
7042   }
7043   StoreInst *SI = cast<StoreInst>(I);
7044 
7045   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
7046   return TTI.getAddressComputationCost(ValTy) +
7047          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
7048                              CostKind) +
7049          (isLoopInvariantStoreValue
7050               ? 0
7051               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
7052                                        VF.getKnownMinValue() - 1));
7053 }
7054 
7055 InstructionCost
7056 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
7057                                                  ElementCount VF) {
7058   Type *ValTy = getLoadStoreType(I);
7059   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7060   const Align Alignment = getLoadStoreAlignment(I);
7061   const Value *Ptr = getLoadStorePointerOperand(I);
7062 
7063   return TTI.getAddressComputationCost(VectorTy) +
7064          TTI.getGatherScatterOpCost(
7065              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
7066              TargetTransformInfo::TCK_RecipThroughput, I);
7067 }
7068 
7069 InstructionCost
7070 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
7071                                                    ElementCount VF) {
7072   // TODO: Once we have support for interleaving with scalable vectors
7073   // we can calculate the cost properly here.
7074   if (VF.isScalable())
7075     return InstructionCost::getInvalid();
7076 
7077   Type *ValTy = getLoadStoreType(I);
7078   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
7079   unsigned AS = getLoadStoreAddressSpace(I);
7080 
7081   auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Failed to get an interleaved access group.");
7083 
7084   unsigned InterleaveFactor = Group->getFactor();
7085   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
7086 
7087   // Holds the indices of existing members in an interleaved load group.
7088   // An interleaved store group doesn't need this as it doesn't allow gaps.
7089   SmallVector<unsigned, 4> Indices;
7090   if (isa<LoadInst>(I)) {
7091     for (unsigned i = 0; i < InterleaveFactor; i++)
7092       if (Group->getMember(i))
7093         Indices.push_back(i);
7094   }
7095 
7096   // Calculate the cost of the whole interleaved group.
7097   bool UseMaskForGaps =
7098       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
7099   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
7100       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
7101       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
7102 
7103   if (Group->isReverse()) {
7104     // TODO: Add support for reversed masked interleaved access.
7105     assert(!Legal->isMaskRequired(I) &&
7106            "Reverse masked interleaved access not supported.");
7107     Cost +=
7108         Group->getNumMembers() *
7109         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
7110   }
7111   return Cost;
7112 }
7113 
7114 InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
7115     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
  // Early exit if there are no in-loop reductions.
7117   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
7118     return InstructionCost::getInvalid();
7119   auto *VectorTy = cast<VectorType>(Ty);
7120 
  // We are looking for one of the following patterns, choosing the one with
  // the minimal acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, then we return it for
  // the reduction instruction and 0 for the other instructions in the pattern.
  // If it is not, we return an invalid cost specifying that the original cost
  // method should be used.
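  // For instance, a scalar chain resembling (IR sketch only):
  //   %ea  = sext i8 %a to i32
  //   %eb  = sext i8 %b to i32
  //   %mul = mul i32 %ea, %eb
  //   %sum = add i32 %sum.phi, %mul   ; in-loop reduction add
  // matches the reduce(mul(ext(A), ext(B))) form, and the whole pattern may be
  // costed as one extended multiply-add reduction when that is cheaper.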
7133   Instruction *RetI = I;
7134   if ((RetI->getOpcode() == Instruction::SExt ||
7135        RetI->getOpcode() == Instruction::ZExt)) {
7136     if (!RetI->hasOneUser())
7137       return InstructionCost::getInvalid();
7138     RetI = RetI->user_back();
7139   }
7140   if (RetI->getOpcode() == Instruction::Mul &&
7141       RetI->user_back()->getOpcode() == Instruction::Add) {
7142     if (!RetI->hasOneUser())
7143       return InstructionCost::getInvalid();
7144     RetI = RetI->user_back();
7145   }
7146 
7147   // Test if the found instruction is a reduction, and if not return an invalid
7148   // cost specifying the parent to use the original cost modelling.
7149   if (!InLoopReductionImmediateChains.count(RetI))
7150     return InstructionCost::getInvalid();
7151 
7152   // Find the reduction this chain is a part of and calculate the basic cost of
7153   // the reduction on its own.
7154   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
7155   Instruction *ReductionPhi = LastChain;
7156   while (!isa<PHINode>(ReductionPhi))
7157     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
7158 
7159   RecurrenceDescriptor RdxDesc =
7160       Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
7161   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
7162       RdxDesc.getOpcode(), VectorTy, false, CostKind);
7163 
7164   // Get the operand that was not the reduction chain and match it to one of the
7165   // patterns, returning the better cost if it is found.
7166   Instruction *RedOp = RetI->getOperand(1) == LastChain
7167                            ? dyn_cast<Instruction>(RetI->getOperand(0))
7168                            : dyn_cast<Instruction>(RetI->getOperand(1));
7169 
7170   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
7171 
7172   if (RedOp && (isa<SExtInst>(RedOp) || isa<ZExtInst>(RedOp)) &&
7173       !TheLoop->isLoopInvariant(RedOp)) {
7174     bool IsUnsigned = isa<ZExtInst>(RedOp);
7175     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
7176     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7177         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7178         CostKind);
7179 
7180     InstructionCost ExtCost =
7181         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
7182                              TTI::CastContextHint::None, CostKind, RedOp);
7183     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
7184       return I == RetI ? *RedCost.getValue() : 0;
7185   } else if (RedOp && RedOp->getOpcode() == Instruction::Mul) {
7186     Instruction *Mul = RedOp;
7187     Instruction *Op0 = dyn_cast<Instruction>(Mul->getOperand(0));
7188     Instruction *Op1 = dyn_cast<Instruction>(Mul->getOperand(1));
7189     if (Op0 && Op1 && (isa<SExtInst>(Op0) || isa<ZExtInst>(Op0)) &&
7190         Op0->getOpcode() == Op1->getOpcode() &&
7191         Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
7192         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
7193       bool IsUnsigned = isa<ZExtInst>(Op0);
7194       auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
7195       // reduce(mul(ext, ext))
7196       InstructionCost ExtCost =
7197           TTI.getCastInstrCost(Op0->getOpcode(), VectorTy, ExtType,
7198                                TTI::CastContextHint::None, CostKind, Op0);
7199       InstructionCost MulCost =
7200           TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
7201 
7202       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7203           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
7204           CostKind);
7205 
7206       if (RedCost.isValid() && RedCost < ExtCost * 2 + MulCost + BaseCost)
7207         return I == RetI ? *RedCost.getValue() : 0;
7208     } else {
7209       InstructionCost MulCost =
7210           TTI.getArithmeticInstrCost(Mul->getOpcode(), VectorTy, CostKind);
7211 
7212       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7213           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
7214           CostKind);
7215 
7216       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
7217         return I == RetI ? *RedCost.getValue() : 0;
7218     }
7219   }
7220 
7221   return I == RetI ? BaseCost : InstructionCost::getInvalid();
7222 }
7223 
7224 InstructionCost
7225 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7226                                                      ElementCount VF) {
7227   // Calculate scalar cost only. Vectorization cost should be ready at this
7228   // moment.
7229   if (VF.isScalar()) {
7230     Type *ValTy = getLoadStoreType(I);
7231     const Align Alignment = getLoadStoreAlignment(I);
7232     unsigned AS = getLoadStoreAddressSpace(I);
7233 
7234     return TTI.getAddressComputationCost(ValTy) +
7235            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7236                                TTI::TCK_RecipThroughput, I);
7237   }
7238   return getWideningCost(I, VF);
7239 }
7240 
7241 LoopVectorizationCostModel::VectorizationCostTy
7242 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7243                                                ElementCount VF) {
7244   // If we know that this instruction will remain uniform, check the cost of
7245   // the scalar version.
7246   if (isUniformAfterVectorization(I, VF))
7247     VF = ElementCount::getFixed(1);
7248 
7249   if (VF.isVector() && isProfitableToScalarize(I, VF))
7250     return VectorizationCostTy(InstsToScalarize[VF][I], false);
7251 
7252   // Forced scalars do not have any scalarization overhead.
7253   auto ForcedScalar = ForcedScalars.find(VF);
7254   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
7255     auto InstSet = ForcedScalar->second;
7256     if (InstSet.count(I))
7257       return VectorizationCostTy(
7258           (getInstructionCost(I, ElementCount::getFixed(1)).first *
7259            VF.getKnownMinValue()),
7260           false);
7261   }
7262 
7263   Type *VectorTy;
7264   InstructionCost C = getInstructionCost(I, VF, VectorTy);
7265 
7266   bool TypeNotScalarized =
7267       VF.isVector() && VectorTy->isVectorTy() &&
7268       TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
7269   return VectorizationCostTy(C, TypeNotScalarized);
7270 }
7271 
7272 InstructionCost
7273 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
7274                                                      ElementCount VF) const {
7275 
7276   if (VF.isScalable())
7277     return InstructionCost::getInvalid();
7278 
7279   if (VF.isScalar())
7280     return 0;
7281 
7282   InstructionCost Cost = 0;
7283   Type *RetTy = ToVectorTy(I->getType(), VF);
7284   if (!RetTy->isVoidTy() &&
7285       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
7286     Cost += TTI.getScalarizationOverhead(
7287         cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
7288         true, false);
7289 
7290   // Some targets keep addresses scalar.
7291   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
7292     return Cost;
7293 
7294   // Some targets support efficient element stores.
7295   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
7296     return Cost;
7297 
7298   // Collect operands to consider.
7299   CallInst *CI = dyn_cast<CallInst>(I);
7300   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
7301 
7302   // Skip operands that do not require extraction/scalarization and do not incur
7303   // any overhead.
7304   SmallVector<Type *> Tys;
7305   for (auto *V : filterExtractingOperands(Ops, VF))
7306     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7307   return Cost + TTI.getOperandsScalarizationOverhead(
7308                     filterExtractingOperands(Ops, VF), Tys);
7309 }
7310 
7311 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
7312   if (VF.isScalar())
7313     return;
7314   NumPredStores = 0;
7315   for (BasicBlock *BB : TheLoop->blocks()) {
7316     // For each instruction in the old loop.
7317     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
7319       if (!Ptr)
7320         continue;
7321 
7322       // TODO: We should generate better code and update the cost model for
7323       // predicated uniform stores. Today they are treated as any other
7324       // predicated store (see added test cases in
7325       // invariant-store-vectorization.ll).
7326       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
7327         NumPredStores++;
7328 
7329       if (Legal->isUniformMemOp(I)) {
7330         // TODO: Avoid replicating loads and stores instead of
7331         // relying on instcombine to remove them.
7332         // Load: Scalar load + broadcast
7333         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
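        // For illustration (hypothetical IR): a load from a loop-invariant
        // address such as
        //   %v = load i32, i32* %p      ; same %p on every iteration
        // is modelled as a single scalar load plus a broadcast of %v into a
        // vector, rather than as VF independent loads.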
7334         InstructionCost Cost = getUniformMemOpCost(&I, VF);
7335         setWideningDecision(&I, VF, CM_Scalarize, Cost);
7336         continue;
7337       }
7338 
7339       // We assume that widening is the best solution when possible.
7340       if (memoryInstructionCanBeWidened(&I, VF)) {
7341         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7342         int ConsecutiveStride =
7343                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
7344         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7345                "Expected consecutive stride.");
7346         InstWidening Decision =
7347             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7348         setWideningDecision(&I, VF, Decision, Cost);
7349         continue;
7350       }
7351 
7352       // Choose between Interleaving, Gather/Scatter or Scalarization.
7353       InstructionCost InterleaveCost = InstructionCost::getInvalid();
7354       unsigned NumAccesses = 1;
7355       if (isAccessInterleaved(&I)) {
7356         auto Group = getInterleavedAccessGroup(&I);
7357         assert(Group && "Fail to get an interleaved access group.");
7358 
7359         // Make one decision for the whole group.
7360         if (getWideningDecision(&I, VF) != CM_Unknown)
7361           continue;
7362 
7363         NumAccesses = Group->getNumMembers();
7364         if (interleavedAccessCanBeWidened(&I, VF))
7365           InterleaveCost = getInterleaveGroupCost(&I, VF);
7366       }
7367 
7368       InstructionCost GatherScatterCost =
7369           isLegalGatherOrScatter(&I)
7370               ? getGatherScatterCost(&I, VF) * NumAccesses
7371               : InstructionCost::getInvalid();
7372 
7373       InstructionCost ScalarizationCost =
7374           getMemInstScalarizationCost(&I, VF) * NumAccesses;
7375 
      // Choose the best option for the current VF, record this decision, and
      // use it during vectorization.
7378       InstructionCost Cost;
7379       InstWidening Decision;
7380       if (InterleaveCost <= GatherScatterCost &&
7381           InterleaveCost < ScalarizationCost) {
7382         Decision = CM_Interleave;
7383         Cost = InterleaveCost;
7384       } else if (GatherScatterCost < ScalarizationCost) {
7385         Decision = CM_GatherScatter;
7386         Cost = GatherScatterCost;
7387       } else {
7388         assert(!VF.isScalable() &&
7389                "We cannot yet scalarise for scalable vectors");
7390         Decision = CM_Scalarize;
7391         Cost = ScalarizationCost;
7392       }
      // If the instruction belongs to an interleave group, the whole group
7394       // receives the same decision. The whole group receives the cost, but
7395       // the cost will actually be assigned to one instruction.
7396       if (auto Group = getInterleavedAccessGroup(&I))
7397         setWideningDecision(Group, VF, Decision, Cost);
7398       else
7399         setWideningDecision(&I, VF, Decision, Cost);
7400     }
7401   }
7402 
7403   // Make sure that any load of address and any other address computation
7404   // remains scalar unless there is gather/scatter support. This avoids
7405   // inevitable extracts into address registers, and also has the benefit of
7406   // activating LSR more, since that pass can't optimize vectorized
7407   // addresses.
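  // For illustration (hypothetical IR): if a loaded value is only used to form
  // an address, e.g.
  //   %idx  = load i64, i64* %p
  //   %addr = getelementptr i32, i32* %base, i64 %idx
  // then widening %idx would force an extractelement per lane just to build
  // the scalar addresses, so below both %idx and the GEP are kept scalar.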
7408   if (TTI.prefersVectorizedAddressing())
7409     return;
7410 
7411   // Start with all scalar pointer uses.
7412   SmallPtrSet<Instruction *, 8> AddrDefs;
7413   for (BasicBlock *BB : TheLoop->blocks())
7414     for (Instruction &I : *BB) {
7415       Instruction *PtrDef =
7416         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7417       if (PtrDef && TheLoop->contains(PtrDef) &&
7418           getWideningDecision(&I, VF) != CM_GatherScatter)
7419         AddrDefs.insert(PtrDef);
7420     }
7421 
7422   // Add all instructions used to generate the addresses.
7423   SmallVector<Instruction *, 4> Worklist;
7424   append_range(Worklist, AddrDefs);
7425   while (!Worklist.empty()) {
7426     Instruction *I = Worklist.pop_back_val();
7427     for (auto &Op : I->operands())
7428       if (auto *InstOp = dyn_cast<Instruction>(Op))
7429         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7430             AddrDefs.insert(InstOp).second)
7431           Worklist.push_back(InstOp);
7432   }
7433 
7434   for (auto *I : AddrDefs) {
7435     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since that involves finding out whether the loaded
      // register is involved in an address computation, it is instead changed
      // here when we know this is the case.
7440       InstWidening Decision = getWideningDecision(I, VF);
7441       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7442         // Scalarize a widened load of address.
7443         setWideningDecision(
7444             I, VF, CM_Scalarize,
7445             (VF.getKnownMinValue() *
7446              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7447       else if (auto Group = getInterleavedAccessGroup(I)) {
7448         // Scalarize an interleave group of address loads.
7449         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7450           if (Instruction *Member = Group->getMember(I))
7451             setWideningDecision(
7452                 Member, VF, CM_Scalarize,
7453                 (VF.getKnownMinValue() *
7454                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7455         }
7456       }
7457     } else
7458       // Make sure I gets scalarized and a cost estimate without
7459       // scalarization overhead.
7460       ForcedScalars[VF].insert(I);
7461   }
7462 }
7463 
7464 InstructionCost
7465 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7466                                                Type *&VectorTy) {
7467   Type *RetTy = I->getType();
7468   if (canTruncateToMinimalBitwidth(I, VF))
7469     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7470   auto SE = PSE.getSE();
7471   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7472 
7473   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7474                                                 ElementCount VF) -> bool {
7475     if (VF.isScalar())
7476       return true;
7477 
7478     auto Scalarized = InstsToScalarize.find(VF);
7479     assert(Scalarized != InstsToScalarize.end() &&
7480            "VF not yet analyzed for scalarization profitability");
7481     return !Scalarized->second.count(I) &&
7482            llvm::all_of(I->users(), [&](User *U) {
7483              auto *UI = cast<Instruction>(U);
7484              return !Scalarized->second.count(UI);
7485            });
7486   };
7487   (void) hasSingleCopyAfterVectorization;
7488 
7489   if (isScalarAfterVectorization(I, VF)) {
7490     // With the exception of GEPs and PHIs, after scalarization there should
7491     // only be one copy of the instruction generated in the loop. This is
7492     // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
7494     // it means we don't have to multiply the instruction cost by VF.
7495     assert(I->getOpcode() == Instruction::GetElementPtr ||
7496            I->getOpcode() == Instruction::PHI ||
7497            (I->getOpcode() == Instruction::BitCast &&
7498             I->getType()->isPointerTy()) ||
7499            hasSingleCopyAfterVectorization(I, VF));
7500     VectorTy = RetTy;
7501   } else
7502     VectorTy = ToVectorTy(RetTy, VF);
7503 
7504   // TODO: We need to estimate the cost of intrinsic calls.
7505   switch (I->getOpcode()) {
7506   case Instruction::GetElementPtr:
7507     // We mark this instruction as zero-cost because the cost of GEPs in
7508     // vectorized code depends on whether the corresponding memory instruction
7509     // is scalarized or not. Therefore, we handle GEPs with the memory
7510     // instruction cost.
7511     return 0;
7512   case Instruction::Br: {
7513     // In cases of scalarized and predicated instructions, there will be VF
7514     // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
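    // For illustration (hypothetical IR): with VF = 4, a predicated operation
    // is emitted as four guarded blocks of roughly the form
    //   %c = extractelement <4 x i1> %mask, i32 0
    //   br i1 %c, label %pred.if, label %pred.continue
    // so each lane pays for an i1 extract plus a branch, which is what the
    // scalarization overhead plus VF * branch-cost computation below models.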
7516     bool ScalarPredicatedBB = false;
7517     BranchInst *BI = cast<BranchInst>(I);
7518     if (VF.isVector() && BI->isConditional() &&
7519         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7520          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7521       ScalarPredicatedBB = true;
7522 
7523     if (ScalarPredicatedBB) {
7524       // Return cost for branches around scalarized and predicated blocks.
7525       assert(!VF.isScalable() && "scalable vectors not yet supported.");
7526       auto *Vec_i1Ty =
7527           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7528       return (TTI.getScalarizationOverhead(
7529                   Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
7530                   false, true) +
7531               (TTI.getCFInstrCost(Instruction::Br, CostKind) *
7532                VF.getKnownMinValue()));
7533     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7534       // The back-edge branch will remain, as will all scalar branches.
7535       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7536     else
7537       // This branch will be eliminated by if-conversion.
7538       return 0;
7539     // Note: We currently assume zero cost for an unconditional branch inside
7540     // a predicated block since it will become a fall-through, although we
7541     // may decide in the future to call TTI for all branches.
7542   }
7543   case Instruction::PHI: {
7544     auto *Phi = cast<PHINode>(I);
7545 
7546     // First-order recurrences are replaced by vector shuffles inside the loop.
7547     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7548     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7549       return TTI.getShuffleCost(
7550           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7551           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7552 
7553     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7554     // converted into select instructions. We require N - 1 selects per phi
7555     // node, where N is the number of incoming values.
7556     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7557       return (Phi->getNumIncomingValues() - 1) *
7558              TTI.getCmpSelInstrCost(
7559                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7560                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7561                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7562 
7563     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7564   }
7565   case Instruction::UDiv:
7566   case Instruction::SDiv:
7567   case Instruction::URem:
7568   case Instruction::SRem:
7569     // If we have a predicated instruction, it may not be executed for each
7570     // vector lane. Get the scalarization cost and scale this amount by the
7571     // probability of executing the predicated block. If the instruction is not
7572     // predicated, we fall through to the next case.
7573     if (VF.isVector() && isScalarWithPredication(I)) {
7574       InstructionCost Cost = 0;
7575 
7576       // These instructions have a non-void type, so account for the phi nodes
7577       // that we will create. This cost is likely to be zero. The phi node
7578       // cost, if any, should be scaled by the block probability because it
7579       // models a copy at the end of each predicated block.
7580       Cost += VF.getKnownMinValue() *
7581               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7582 
7583       // The cost of the non-predicated instruction.
7584       Cost += VF.getKnownMinValue() *
7585               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7586 
7587       // The cost of insertelement and extractelement instructions needed for
7588       // scalarization.
7589       Cost += getScalarizationOverhead(I, VF);
7590 
7591       // Scale the cost by the probability of executing the predicated blocks.
7592       // This assumes the predicated block for each vector lane is equally
7593       // likely.
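      // Worked example (illustrative numbers, assuming the reciprocal block
      // probability is 2, i.e. each predicated block runs about half the
      // time): with VF = 4, a phi cost of 0, a scalar sdiv cost of 20 and a
      // scalarization overhead of 8, the estimate is (4*0 + 4*20 + 8) / 2 = 44.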
7594       return Cost / getReciprocalPredBlockProb();
7595     }
7596     LLVM_FALLTHROUGH;
7597   case Instruction::Add:
7598   case Instruction::FAdd:
7599   case Instruction::Sub:
7600   case Instruction::FSub:
7601   case Instruction::Mul:
7602   case Instruction::FMul:
7603   case Instruction::FDiv:
7604   case Instruction::FRem:
7605   case Instruction::Shl:
7606   case Instruction::LShr:
7607   case Instruction::AShr:
7608   case Instruction::And:
7609   case Instruction::Or:
7610   case Instruction::Xor: {
7611     // Since we will replace the stride by 1 the multiplication should go away.
7612     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7613       return 0;
7614 
7615     // Detect reduction patterns
7616     InstructionCost RedCost;
7617     if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7618             .isValid())
7619       return RedCost;
7620 
7621     // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
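    // For illustration: on many x86 subtargets a vector shift by a uniform
    // constant amount, e.g. the IR produced for 'X[i] << 3', is cheaper than
    // a shift whose amount differs per lane, so the operand kind/properties
    // computed below let the target report the cheaper cost.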
7623     Value *Op2 = I->getOperand(1);
7624     TargetTransformInfo::OperandValueProperties Op2VP;
7625     TargetTransformInfo::OperandValueKind Op2VK =
7626         TTI.getOperandInfo(Op2, Op2VP);
7627     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7628       Op2VK = TargetTransformInfo::OK_UniformValue;
7629 
7630     SmallVector<const Value *, 4> Operands(I->operand_values());
7631     return TTI.getArithmeticInstrCost(
7632         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7633         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7634   }
7635   case Instruction::FNeg: {
7636     return TTI.getArithmeticInstrCost(
7637         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7638         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7639         TargetTransformInfo::OP_None, I->getOperand(0), I);
7640   }
7641   case Instruction::Select: {
7642     SelectInst *SI = cast<SelectInst>(I);
7643     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7644     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7645 
7646     const Value *Op0, *Op1;
7647     using namespace llvm::PatternMatch;
7648     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7649                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7650       // select x, y, false --> x & y
7651       // select x, true, y --> x | y
7652       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7653       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7654       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7655       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7656       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7657               Op1->getType()->getScalarSizeInBits() == 1);
7658 
7659       SmallVector<const Value *, 2> Operands{Op0, Op1};
7660       return TTI.getArithmeticInstrCost(
7661           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7662           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7663     }
7664 
7665     Type *CondTy = SI->getCondition()->getType();
7666     if (!ScalarCond)
7667       CondTy = VectorType::get(CondTy, VF);
7668     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
7669                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7670   }
7671   case Instruction::ICmp:
7672   case Instruction::FCmp: {
7673     Type *ValTy = I->getOperand(0)->getType();
7674     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7675     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7676       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7677     VectorTy = ToVectorTy(ValTy, VF);
7678     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7679                                   CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
7680   }
7681   case Instruction::Store:
7682   case Instruction::Load: {
7683     ElementCount Width = VF;
7684     if (Width.isVector()) {
7685       InstWidening Decision = getWideningDecision(I, Width);
7686       assert(Decision != CM_Unknown &&
7687              "CM decision should be taken at this point");
7688       if (Decision == CM_Scalarize)
7689         Width = ElementCount::getFixed(1);
7690     }
7691     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7692     return getMemoryInstructionCost(I, VF);
7693   }
7694   case Instruction::BitCast:
7695     if (I->getType()->isPointerTy())
7696       return 0;
7697     LLVM_FALLTHROUGH;
7698   case Instruction::ZExt:
7699   case Instruction::SExt:
7700   case Instruction::FPToUI:
7701   case Instruction::FPToSI:
7702   case Instruction::FPExt:
7703   case Instruction::PtrToInt:
7704   case Instruction::IntToPtr:
7705   case Instruction::SIToFP:
7706   case Instruction::UIToFP:
7707   case Instruction::Trunc:
7708   case Instruction::FPTrunc: {
7709     // Computes the CastContextHint from a Load/Store instruction.
7710     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7711       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7712              "Expected a load or a store!");
7713 
7714       if (VF.isScalar() || !TheLoop->contains(I))
7715         return TTI::CastContextHint::Normal;
7716 
7717       switch (getWideningDecision(I, VF)) {
7718       case LoopVectorizationCostModel::CM_GatherScatter:
7719         return TTI::CastContextHint::GatherScatter;
7720       case LoopVectorizationCostModel::CM_Interleave:
7721         return TTI::CastContextHint::Interleave;
7722       case LoopVectorizationCostModel::CM_Scalarize:
7723       case LoopVectorizationCostModel::CM_Widen:
7724         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7725                                         : TTI::CastContextHint::Normal;
7726       case LoopVectorizationCostModel::CM_Widen_Reverse:
7727         return TTI::CastContextHint::Reversed;
7728       case LoopVectorizationCostModel::CM_Unknown:
7729         llvm_unreachable("Instr did not go through cost modelling?");
7730       }
7731 
7732       llvm_unreachable("Unhandled case!");
7733     };
7734 
7735     unsigned Opcode = I->getOpcode();
7736     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7737     // For Trunc, the context is the only user, which must be a StoreInst.
7738     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7739       if (I->hasOneUse())
7740         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7741           CCH = ComputeCCH(Store);
7742     }
7743     // For Z/Sext, the context is the operand, which must be a LoadInst.
7744     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7745              Opcode == Instruction::FPExt) {
7746       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7747         CCH = ComputeCCH(Load);
7748     }
7749 
7750     // We optimize the truncation of induction variables having constant
7751     // integer steps. The cost of these truncations is the same as the scalar
7752     // operation.
7753     if (isOptimizableIVTruncate(I, VF)) {
7754       auto *Trunc = cast<TruncInst>(I);
7755       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7756                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7757     }
7758 
7759     // Detect reduction patterns
7760     InstructionCost RedCost;
7761     if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7762             .isValid())
7763       return RedCost;
7764 
7765     Type *SrcScalarTy = I->getOperand(0)->getType();
7766     Type *SrcVecTy =
7767         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7768     if (canTruncateToMinimalBitwidth(I, VF)) {
7769       // This cast is going to be shrunk. This may remove the cast or it might
7770       // turn it into slightly different cast. For example, if MinBW == 16,
7771       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7772       //
7773       // Calculate the modified src and dest types.
7774       Type *MinVecTy = VectorTy;
7775       if (Opcode == Instruction::Trunc) {
7776         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7777         VectorTy =
7778             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7779       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7780         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7781         VectorTy =
7782             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7783       }
7784     }
7785 
7786     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7787   }
7788   case Instruction::Call: {
7789     bool NeedToScalarize;
7790     CallInst *CI = cast<CallInst>(I);
7791     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7792     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7793       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7794       return std::min(CallCost, IntrinsicCost);
7795     }
7796     return CallCost;
7797   }
7798   case Instruction::ExtractValue:
7799     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7800   default:
7801     // This opcode is unknown. Assume that it is the same as 'mul'.
7802     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7803   } // end of switch.
7804 }
7805 
7806 char LoopVectorize::ID = 0;
7807 
7808 static const char lv_name[] = "Loop Vectorization";
7809 
7810 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7811 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7812 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7813 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7814 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7815 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7816 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7817 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7818 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7819 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7820 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7821 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7822 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7823 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7824 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7825 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7826 
7827 namespace llvm {
7828 
7829 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7830 
7831 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7832                               bool VectorizeOnlyWhenForced) {
7833   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7834 }
7835 
7836 } // end namespace llvm
7837 
7838 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7839   // Check if the pointer operand of a load or store instruction is
7840   // consecutive.
7841   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7842     return Legal->isConsecutivePtr(Ptr);
7843   return false;
7844 }
7845 
7846 void LoopVectorizationCostModel::collectValuesToIgnore() {
7847   // Ignore ephemeral values.
7848   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7849 
7850   // Ignore type-promoting instructions we identified during reduction
7851   // detection.
7852   for (auto &Reduction : Legal->getReductionVars()) {
7853     RecurrenceDescriptor &RedDes = Reduction.second;
7854     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7855     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7856   }
7857   // Ignore type-casting instructions we identified during induction
7858   // detection.
7859   for (auto &Induction : Legal->getInductionVars()) {
7860     InductionDescriptor &IndDes = Induction.second;
7861     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7862     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7863   }
7864 }
7865 
7866 void LoopVectorizationCostModel::collectInLoopReductions() {
7867   for (auto &Reduction : Legal->getReductionVars()) {
7868     PHINode *Phi = Reduction.first;
7869     RecurrenceDescriptor &RdxDesc = Reduction.second;
7870 
7871     // We don't collect reductions that are type promoted (yet).
7872     if (RdxDesc.getRecurrenceType() != Phi->getType())
7873       continue;
7874 
7875     // If the target would prefer this reduction to happen "in-loop", then we
7876     // want to record it as such.
7877     unsigned Opcode = RdxDesc.getOpcode();
7878     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7879         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7880                                    TargetTransformInfo::ReductionFlags()))
7881       continue;
7882 
7883     // Check that we can correctly put the reductions into the loop, by
7884     // finding the chain of operations that leads from the phi to the loop
7885     // exit value.
7886     SmallVector<Instruction *, 4> ReductionOperations =
7887         RdxDesc.getReductionOpChain(Phi, TheLoop);
7888     bool InLoop = !ReductionOperations.empty();
7889     if (InLoop) {
7890       InLoopReductionChains[Phi] = ReductionOperations;
7891       // Add the elements to InLoopReductionImmediateChains for cost modelling.
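      // For illustration: given a chain phi -> %add1 -> %add2, this records
      // { %add1 -> phi, %add2 -> %add1 }, i.e. each reduction operation maps
      // to the previous link, so getReductionPatternCost can walk back from
      // any link to the reduction phi.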
7892       Instruction *LastChain = Phi;
7893       for (auto *I : ReductionOperations) {
7894         InLoopReductionImmediateChains[I] = LastChain;
7895         LastChain = I;
7896       }
7897     }
7898     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7899                       << " reduction for phi: " << *Phi << "\n");
7900   }
7901 }
7902 
7903 // TODO: we could return a pair of values that specify the max VF and
7904 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7906 // doesn't have a cost model that can choose which plan to execute if
7907 // more than one is generated.
7908 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7909                                  LoopVectorizationCostModel &CM) {
7910   unsigned WidestType;
7911   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7912   return WidestVectorRegBits / WidestType;
7913 }
7914 
7915 VectorizationFactor
7916 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7917   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7918   ElementCount VF = UserVF;
7919   // Outer loop handling: They may require CFG and instruction level
7920   // transformations before even evaluating whether vectorization is profitable.
7921   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7922   // the vectorization pipeline.
7923   if (!OrigLoop->isInnermost()) {
7924     // If the user doesn't provide a vectorization factor, determine a
7925     // reasonable one.
7926     if (UserVF.isZero()) {
7927       VF = ElementCount::getFixed(determineVPlanVF(
7928           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7929               .getFixedSize(),
7930           CM));
7931       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7932 
7933       // Make sure we have a VF > 1 for stress testing.
7934       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7935         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7936                           << "overriding computed VF.\n");
7937         VF = ElementCount::getFixed(4);
7938       }
7939     }
7940     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7941     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7942            "VF needs to be a power of two");
7943     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7944                       << "VF " << VF << " to build VPlans.\n");
7945     buildVPlans(VF, VF);
7946 
7947     // For VPlan build stress testing, we bail out after VPlan construction.
7948     if (VPlanBuildStressTest)
7949       return VectorizationFactor::Disabled();
7950 
7951     return {VF, 0 /*Cost*/};
7952   }
7953 
7954   LLVM_DEBUG(
7955       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7956                 "VPlan-native path.\n");
7957   return VectorizationFactor::Disabled();
7958 }
7959 
7960 Optional<VectorizationFactor>
7961 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7962   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7963   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized or interleaved.
7965     return None;
7966 
7967   // Invalidate interleave groups if all blocks of loop will be predicated.
7968   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
7969       !useMaskedInterleavedAccesses(*TTI)) {
7970     LLVM_DEBUG(
7971         dbgs()
7972         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7973            "which requires masked-interleaved support.\n");
7974     if (CM.InterleaveInfo.invalidateGroups())
7975       // Invalidating interleave groups also requires invalidating all decisions
7976       // based on them, which includes widening decisions and uniform and scalar
7977       // values.
7978       CM.invalidateCostModelingDecisions();
7979   }
7980 
7981   ElementCount MaxUserVF =
7982       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7983   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7984   if (!UserVF.isZero() && UserVFIsLegal) {
7985     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVFIsLegal ? "user" : "max")
7986                       << " VF " << UserVF << ".\n");
7987     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7988            "VF needs to be a power of two");
7989     // Collect the instructions (and their associated costs) that will be more
7990     // profitable to scalarize.
7991     CM.selectUserVectorizationFactor(UserVF);
7992     CM.collectInLoopReductions();
7993     buildVPlansWithVPRecipes(UserVF, UserVF);
7994     LLVM_DEBUG(printPlans(dbgs()));
7995     return {{UserVF, 0}};
7996   }
7997 
7998   // Populate the set of Vectorization Factor Candidates.
7999   ElementCountSet VFCandidates;
8000   for (auto VF = ElementCount::getFixed(1);
8001        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
8002     VFCandidates.insert(VF);
8003   for (auto VF = ElementCount::getScalable(1);
8004        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
8005     VFCandidates.insert(VF);
8006 
8007   for (const auto VF : VFCandidates) {
8008     // Collect Uniform and Scalar instructions after vectorization with VF.
8009     CM.collectUniformsAndScalars(VF);
8010 
8011     // Collect the instructions (and their associated costs) that will be more
8012     // profitable to scalarize.
8013     if (VF.isVector())
8014       CM.collectInstsToScalarize(VF);
8015   }
8016 
8017   CM.collectInLoopReductions();
8018   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
8019   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
8020 
8021   LLVM_DEBUG(printPlans(dbgs()));
8022   if (!MaxFactors.hasVector())
8023     return VectorizationFactor::Disabled();
8024 
8025   // Select the optimal vectorization factor.
8026   auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
8027 
8028   // Check if it is profitable to vectorize with runtime checks.
8029   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
8030   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
8031     bool PragmaThresholdReached =
8032         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
8033     bool ThresholdReached =
8034         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
8035     if ((ThresholdReached && !Hints.allowReordering()) ||
8036         PragmaThresholdReached) {
8037       ORE->emit([&]() {
8038         return OptimizationRemarkAnalysisAliasing(
8039                    DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
8040                    OrigLoop->getHeader())
8041                << "loop not vectorized: cannot prove it is safe to reorder "
8042                   "memory operations";
8043       });
8044       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
8045       Hints.emitRemarkWithHints();
8046       return VectorizationFactor::Disabled();
8047     }
8048   }
8049   return SelectedVF;
8050 }
8051 
8052 void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
8053   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
8054                     << '\n');
8055   BestVF = VF;
8056   BestUF = UF;
8057 
8058   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
8059     return !Plan->hasVF(VF);
8060   });
  assert(VPlans.size() == 1 && "Best VF does not have a single VPlan.");
8062 }
8063 
8064 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
8065                                            DominatorTree *DT) {
8066   // Perform the actual loop transformation.
8067 
8068   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
8069   assert(BestVF.hasValue() && "Vectorization Factor is missing");
8070   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
8071 
8072   VPTransformState State{
8073       *BestVF, BestUF, LI, DT, ILV.Builder, &ILV, VPlans.front().get()};
8074   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
8075   State.TripCount = ILV.getOrCreateTripCount(nullptr);
8076   State.CanonicalIV = ILV.Induction;
8077 
8078   ILV.printDebugTracesAtStart();
8079 
8080   //===------------------------------------------------===//
8081   //
  // Notice: any optimization or new instruction that goes
8083   // into the code below should also be implemented in
8084   // the cost-model.
8085   //
8086   //===------------------------------------------------===//
8087 
8088   // 2. Copy and widen instructions from the old loop into the new loop.
8089   VPlans.front()->execute(&State);
8090 
8091   // 3. Fix the vectorized code: take care of header phi's, live-outs,
8092   //    predication, updating analyses.
8093   ILV.fixVectorizedLoop(State);
8094 
8095   ILV.printDebugTracesAtEnd();
8096 }
8097 
8098 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
8099 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
8100   for (const auto &Plan : VPlans)
8101     if (PrintVPlansInDotFormat)
8102       Plan->printDOT(O);
8103     else
8104       Plan->print(O);
8105 }
8106 #endif
8107 
8108 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
8109     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
8110 
  // We create new control-flow for the vectorized loop, so the original exit
  // conditions will be dead after vectorization if they are only used by the
  // terminator.
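  // For illustration (hypothetical IR): an exit test such as
  //   %cmp = icmp eq i64 %iv.next, %n
  //   br i1 %cmp, label %exit, label %loop
  // needs no widened copy because the vectorized loop gets a freshly created
  // exit condition, so %cmp (and a dead trunc feeding it, if any) is recorded
  // here.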
8114   SmallVector<BasicBlock*> ExitingBlocks;
8115   OrigLoop->getExitingBlocks(ExitingBlocks);
8116   for (auto *BB : ExitingBlocks) {
8117     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
8118     if (!Cmp || !Cmp->hasOneUse())
8119       continue;
8120 
8121     // TODO: we should introduce a getUniqueExitingBlocks on Loop
8122     if (!DeadInstructions.insert(Cmp).second)
8123       continue;
8124 
    // An operand of the icmp is often a dead trunc, used by IndUpdate.
8126     // TODO: can recurse through operands in general
8127     for (Value *Op : Cmp->operands()) {
8128       if (isa<TruncInst>(Op) && Op->hasOneUse())
8129           DeadInstructions.insert(cast<Instruction>(Op));
8130     }
8131   }
8132 
8133   // We create new "steps" for induction variable updates to which the original
8134   // induction variables map. An original update instruction will be dead if
8135   // all its users except the induction variable are dead.
8136   auto *Latch = OrigLoop->getLoopLatch();
8137   for (auto &Induction : Legal->getInductionVars()) {
8138     PHINode *Ind = Induction.first;
8139     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8140 
    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
8143     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
8144       continue;
8145 
8146     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
8147           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
8148         }))
8149       DeadInstructions.insert(IndUpdate);
8150 
    // We also record as "Dead" the type-casting instructions we identified
    // during induction analysis. We don't need any handling for them in the
8153     // vectorized loop because we have proven that, under a proper runtime
8154     // test guarding the vectorized loop, the value of the phi, and the casted
8155     // value of the phi, are the same. The last instruction in this casting chain
8156     // will get its scalar/vector/widened def from the scalar/vector/widened def
8157     // of the respective phi node. Any other casts in the induction def-use chain
8158     // have no other uses outside the phi update chain, and will be ignored.
8159     InductionDescriptor &IndDes = Induction.second;
8160     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
8161     DeadInstructions.insert(Casts.begin(), Casts.end());
8162   }
8163 }
8164 
8165 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
8166 
8167 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
8168 
8169 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
8170                                         Instruction::BinaryOps BinOp) {
8171   // When unrolling and the VF is 1, we only need to add a simple scalar.
8172   Type *Ty = Val->getType();
8173   assert(!Ty->isVectorTy() && "Val must be a scalar");
8174 
8175   if (Ty->isFloatingPointTy()) {
8176     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
8177 
8178     // Floating-point operations inherit FMF via the builder's flags.
8179     Value *MulOp = Builder.CreateFMul(C, Step);
8180     return Builder.CreateBinOp(BinOp, Val, MulOp);
8181   }
8182   Constant *C = ConstantInt::get(Ty, StartIdx);
8183   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
8184 }
8185 
8186 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
8187   SmallVector<Metadata *, 4> MDs;
8188   // Reserve first location for self reference to the LoopID metadata node.
8189   MDs.push_back(nullptr);
8190   bool IsUnrollMetadata = false;
8191   MDNode *LoopID = L->getLoopID();
8192   if (LoopID) {
8193     // First find existing loop unrolling disable metadata.
8194     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
8195       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
8196       if (MD) {
8197         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
8198         IsUnrollMetadata =
8199             S && S->getString().startswith("llvm.loop.unroll.disable");
8200       }
8201       MDs.push_back(LoopID->getOperand(i));
8202     }
8203   }
8204 
8205   if (!IsUnrollMetadata) {
8206     // Add runtime unroll disable metadata.
8207     LLVMContext &Context = L->getHeader()->getContext();
8208     SmallVector<Metadata *, 1> DisableOperands;
8209     DisableOperands.push_back(
8210         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
8211     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
8212     MDs.push_back(DisableNode);
8213     MDNode *NewLoopID = MDNode::get(Context, MDs);
8214     // Set operand 0 to refer to the loop id itself.
8215     NewLoopID->replaceOperandWith(0, NewLoopID);
8216     L->setLoopID(NewLoopID);
8217   }
8218 }
8219 
8220 //===--------------------------------------------------------------------===//
8221 // EpilogueVectorizerMainLoop
8222 //===--------------------------------------------------------------------===//
8223 
8224 /// This function is partially responsible for generating the control flow
8225 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8226 BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
8227   MDNode *OrigLoopID = OrigLoop->getLoopID();
8228   Loop *Lp = createVectorLoopSkeleton("");
8229 
8230   // Generate the code to check the minimum iteration count of the vector
8231   // epilogue (see below).
8232   EPI.EpilogueIterationCountCheck =
8233       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
8234   EPI.EpilogueIterationCountCheck->setName("iter.check");
8235 
8236   // Generate the code to check any assumptions that we've made for SCEV
8237   // expressions.
8238   EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
8239 
8240   // Generate the code that checks at runtime if arrays overlap. We put the
8241   // checks into a separate block to make the more common case of few elements
8242   // faster.
8243   EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
8244 
8245   // Generate the iteration count check for the main loop, *after* the check
8246   // for the epilogue loop, so that the path-length is shorter for the case
8247   // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for by the gain from vectorizing the larger
8249   // trip count. Note: the branch will get updated later on when we vectorize
8250   // the epilogue.
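  // For illustration, the checks built by this function fall through roughly
  // as
  //   iter.check -> SCEV checks -> memory checks -> vector.main.loop.iter.check
  // so the epilogue-VF guard is evaluated first and the main-VF guard last,
  // giving the shorter path to the vector epilogue described above.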
8251   EPI.MainLoopIterationCountCheck =
8252       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
8253 
8254   // Generate the induction variable.
8255   OldInduction = Legal->getPrimaryInduction();
8256   Type *IdxTy = Legal->getWidestInductionType();
8257   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8258   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8259   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8260   EPI.VectorTripCount = CountRoundDown;
8261   Induction =
8262       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8263                               getDebugLocFromInstOrOperands(OldInduction));
8264 
8265   // Skip induction resume value creation here because they will be created in
8266   // the second pass. If we created them here, they wouldn't be used anyway,
8267   // because the vplan in the second pass still contains the inductions from the
8268   // original loop.
8269 
8270   return completeLoopSkeleton(Lp, OrigLoopID);
8271 }
8272 
8273 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
8274   LLVM_DEBUG({
8275     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
8276            << "Main Loop VF:" << EPI.MainLoopVF.getKnownMinValue()
8277            << ", Main Loop UF:" << EPI.MainLoopUF
8278            << ", Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
8279            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8280   });
8281 }
8282 
8283 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
8284   DEBUG_WITH_TYPE(VerboseDebug, {
8285     dbgs() << "intermediate fn:\n" << *Induction->getFunction() << "\n";
8286   });
8287 }
8288 
8289 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8290     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
8291   assert(L && "Expected valid Loop.");
8292   assert(Bypass && "Expected valid bypass basic block.");
8293   unsigned VFactor =
8294       ForEpilogue ? EPI.EpilogueVF.getKnownMinValue() : VF.getKnownMinValue();
8295   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8296   Value *Count = getOrCreateTripCount(L);
8297   // Reuse existing vector loop preheader for TC checks.
8298   // Note that new preheader block is generated for vector loop.
8299   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8300   IRBuilder<> Builder(TCCheckBlock->getTerminator());
8301 
8302   // Generate code to check if the loop's trip count is less than VF * UF of the
8303   // main vector loop.
8304   auto P =
8305       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8306 
8307   Value *CheckMinIters = Builder.CreateICmp(
8308       P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
8309       "min.iters.check");
8310 
8311   if (!ForEpilogue)
8312     TCCheckBlock->setName("vector.main.loop.iter.check");
8313 
8314   // Create new preheader for vector loop.
8315   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8316                                    DT, LI, nullptr, "vector.ph");
8317 
8318   if (ForEpilogue) {
8319     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8320                                  DT->getNode(Bypass)->getIDom()) &&
8321            "TC check is expected to dominate Bypass");
8322 
8323     // Update dominator for Bypass & LoopExit.
8324     DT->changeImmediateDominator(Bypass, TCCheckBlock);
8325     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8326 
8327     LoopBypassBlocks.push_back(TCCheckBlock);
8328 
8329     // Save the trip count so we don't have to regenerate it in the
8330     // vec.epilog.iter.check. This is safe to do because the trip count
8331     // generated here dominates the vector epilog iter check.
8332     EPI.TripCount = Count;
8333   }
8334 
8335   ReplaceInstWithInst(
8336       TCCheckBlock->getTerminator(),
8337       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8338 
8339   return TCCheckBlock;
8340 }
8341 
8342 //===--------------------------------------------------------------------===//
8343 // EpilogueVectorizerEpilogueLoop
8344 //===--------------------------------------------------------------------===//
8345 
8346 /// This function is partially responsible for generating the control flow
8347 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8348 BasicBlock *
8349 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8350   MDNode *OrigLoopID = OrigLoop->getLoopID();
8351   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
8352 
  // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue, skip to the scalar part.
8355   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
8356   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
8357   LoopVectorPreHeader =
8358       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
8359                  LI, nullptr, "vec.epilog.ph");
8360   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
8361                                           VecEpilogueIterationCountCheck);
8362 
8363   // Adjust the control flow taking the state info from the main loop
8364   // vectorization into account.
8365   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8366          "expected this to be saved from the previous pass.");
8367   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8368       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8369 
8370   DT->changeImmediateDominator(LoopVectorPreHeader,
8371                                EPI.MainLoopIterationCountCheck);
8372 
8373   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8374       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8375 
8376   if (EPI.SCEVSafetyCheck)
8377     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8378         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8379   if (EPI.MemSafetyCheck)
8380     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8381         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8382 
8383   DT->changeImmediateDominator(
8384       VecEpilogueIterationCountCheck,
8385       VecEpilogueIterationCountCheck->getSinglePredecessor());
8386 
8387   DT->changeImmediateDominator(LoopScalarPreHeader,
8388                                EPI.EpilogueIterationCountCheck);
8389   DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
8390 
8391   // Keep track of bypass blocks, as they feed start values to the induction
8392   // phis in the scalar loop preheader.
8393   if (EPI.SCEVSafetyCheck)
8394     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8395   if (EPI.MemSafetyCheck)
8396     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8397   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8398 
8399   // Generate a resume induction for the vector epilogue and put it in the
  // vector epilogue preheader.
8401   Type *IdxTy = Legal->getWidestInductionType();
8402   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8403                                          LoopVectorPreHeader->getFirstNonPHI());
8404   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8405   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8406                            EPI.MainLoopIterationCountCheck);
8407 
8408   // Generate the induction variable.
8409   OldInduction = Legal->getPrimaryInduction();
8410   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8411   Constant *Step = ConstantInt::get(IdxTy, VF.getKnownMinValue() * UF);
8412   Value *StartIdx = EPResumeVal;
8413   Induction =
8414       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
8415                               getDebugLocFromInstOrOperands(OldInduction));
8416 
8417   // Generate induction resume values. These variables save the new starting
8418   // indexes for the scalar loop. They are used to test if there are any tail
8419   // iterations left once the vector loop has completed.
8420   // Note that when the vectorized epilogue is skipped due to iteration count
8421   // check, then the resume value for the induction variable comes from
8422   // the trip count of the main vector loop, hence passing the AdditionalBypass
8423   // argument.
8424   createInductionResumeValues(Lp, CountRoundDown,
8425                               {VecEpilogueIterationCountCheck,
8426                                EPI.VectorTripCount} /* AdditionalBypass */);
8427 
8428   AddRuntimeUnrollDisableMetaData(Lp);
8429   return completeLoopSkeleton(Lp, OrigLoopID);
8430 }
8431 
8432 BasicBlock *
8433 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8434     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8435 
8436   assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
8438   assert(
8439       (!isa<Instruction>(EPI.TripCount) ||
8440        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8441       "saved trip count does not dominate insertion point.");
8442   Value *TC = EPI.TripCount;
8443   IRBuilder<> Builder(Insert->getTerminator());
8444   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8445 
  // Generate code to check if the remaining iteration count (trip count minus
  // the main loop's vector trip count) is less than VF * UF of the vector
  // epilogue loop.
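  // For illustration only (value names are examples), the emitted check
  // resembles:
  //   %n.vec.remaining = sub i64 %trip.count, %main.vec.trip.count
  //   %min.epilog.iters.check = icmp ult i64 %n.vec.remaining, <EpilogueVF * EpilogueUF>
  //   br i1 %min.epilog.iters.check, label %scalar.ph, label %vec.epilog.ph
  // with ule instead of ult when a scalar epilogue is required.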
8448   auto P =
8449       Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8450 
8451   Value *CheckMinIters = Builder.CreateICmp(
8452       P, Count,
8453       ConstantInt::get(Count->getType(),
8454                        EPI.EpilogueVF.getKnownMinValue() * EPI.EpilogueUF),
8455       "min.epilog.iters.check");
8456 
8457   ReplaceInstWithInst(
8458       Insert->getTerminator(),
8459       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8460 
8461   LoopBypassBlocks.push_back(Insert);
8462   return Insert;
8463 }
8464 
8465 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8466   LLVM_DEBUG({
8467     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8468            << "Epilogue Loop VF:" << EPI.EpilogueVF.getKnownMinValue()
8469            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8470   });
8471 }
8472 
8473 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8474   DEBUG_WITH_TYPE(VerboseDebug, {
8475     dbgs() << "final fn:\n" << *Induction->getFunction() << "\n";
8476   });
8477 }
8478 
8479 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8480     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8481   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8482   bool PredicateAtRangeStart = Predicate(Range.Start);
8483 
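  // Illustrative example (values are assumed): with Range = [4, 32) and a
  // Predicate that holds for VF=4 and VF=8 but not for VF=16, the loop below
  // clamps Range.End to 16 and the function returns the decision taken at
  // VF=4.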
8484   for (ElementCount TmpVF = Range.Start * 2;
8485        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8486     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8487       Range.End = TmpVF;
8488       break;
8489     }
8490 
8491   return PredicateAtRangeStart;
8492 }
8493 
8494 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8495 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8496 /// of VF's starting at a given VF and extending it as much as possible. Each
8497 /// vectorization decision can potentially shorten this sub-range during
8498 /// buildVPlan().
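/// For example (illustrative), starting from the range {4, ..., 16} this may
/// produce one VPlan covering {4, 8} and another covering {16}, if some
/// vectorization decision for VF=16 differs from the one taken for VF=4.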
8499 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8500                                            ElementCount MaxVF) {
8501   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8502   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8503     VFRange SubRange = {VF, MaxVFPlusOne};
8504     VPlans.push_back(buildVPlan(SubRange));
8505     VF = SubRange.End;
8506   }
8507 }
8508 
8509 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8510                                          VPlanPtr &Plan) {
8511   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8512 
8513   // Look for cached value.
8514   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8515   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8516   if (ECEntryIt != EdgeMaskCache.end())
8517     return ECEntryIt->second;
8518 
8519   VPValue *SrcMask = createBlockInMask(Src, Plan);
8520 
8521   // The terminator has to be a branch inst!
8522   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8523   assert(BI && "Unexpected terminator found");
8524 
8525   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8526     return EdgeMaskCache[Edge] = SrcMask;
8527 
8528   // If source is an exiting block, we know the exit edge is dynamically dead
8529   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8530   // adding uses of an otherwise potentially dead instruction.
8531   if (OrigLoop->isLoopExiting(Src))
8532     return EdgeMaskCache[Edge] = SrcMask;
8533 
8534   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8535   assert(EdgeMask && "No Edge Mask found for condition");
8536 
8537   if (BI->getSuccessor(0) != Dst)
8538     EdgeMask = Builder.createNot(EdgeMask);
8539 
8540   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8541     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8542     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8543     // The select version does not introduce new UB if SrcMask is false and
8544     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8545     VPValue *False = Plan->getOrAddVPValue(
8546         ConstantInt::getFalse(BI->getCondition()->getType()));
8547     EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
8548   }
8549 
8550   return EdgeMaskCache[Edge] = EdgeMask;
8551 }
8552 
8553 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8554   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8555 
8556   // Look for cached value.
8557   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8558   if (BCEntryIt != BlockMaskCache.end())
8559     return BCEntryIt->second;
8560 
8561   // All-one mask is modelled as no-mask following the convention for masked
8562   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8563   VPValue *BlockMask = nullptr;
8564 
8565   if (OrigLoop->getHeader() == BB) {
8566     if (!CM.blockNeedsPredication(BB))
8567       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8568 
8569     // Create the block in mask as the first non-phi instruction in the block.
8570     VPBuilder::InsertPointGuard Guard(Builder);
8571     auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
8572     Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
8573 
8574     // Introduce the early-exit compare IV <= BTC to form header block mask.
8575     // This is used instead of IV < TC because TC may wrap, unlike BTC.
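    // E.g. (illustrative), for a loop with trip count TC, the mask for the
    // widened lanes <iv, iv+1, ...> is the lane-wise compare
    // 'icmp ule <iv, iv+1, ...>, BTC' with BTC = TC - 1; unlike a compare
    // against TC, this remains correct even when TC wraps around to 0.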
8576     // Start by constructing the desired canonical IV.
8577     VPValue *IV = nullptr;
8578     if (Legal->getPrimaryInduction())
8579       IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
8580     else {
8581       auto IVRecipe = new VPWidenCanonicalIVRecipe();
8582       Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
8583       IV = IVRecipe->getVPSingleValue();
8584     }
8585     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8586     bool TailFolded = !CM.isScalarEpilogueAllowed();
8587 
8588     if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
      // While ActiveLaneMask is a binary op that consumes the loop tripcount
      // as its second argument, we only pass the IV here and extract the
      // tripcount from the transform state, where codegen of the VP
      // instructions happens.
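      // At codegen time this is expected to lower to a call to the
      // llvm.get.active.lane.mask intrinsic on (IV, trip count).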
8593       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
8594     } else {
8595       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8596     }
8597     return BlockMaskCache[BB] = BlockMask;
8598   }
8599 
8600   // This is the block mask. We OR all incoming edges.
8601   for (auto *Predecessor : predecessors(BB)) {
8602     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8603     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8604       return BlockMaskCache[BB] = EdgeMask;
8605 
8606     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8607       BlockMask = EdgeMask;
8608       continue;
8609     }
8610 
8611     BlockMask = Builder.createOr(BlockMask, EdgeMask);
8612   }
8613 
8614   return BlockMaskCache[BB] = BlockMask;
8615 }
8616 
8617 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8618                                                 ArrayRef<VPValue *> Operands,
8619                                                 VFRange &Range,
8620                                                 VPlanPtr &Plan) {
8621   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8622          "Must be called with either a load or store");
8623 
8624   auto willWiden = [&](ElementCount VF) -> bool {
8625     if (VF.isScalar())
8626       return false;
8627     LoopVectorizationCostModel::InstWidening Decision =
8628         CM.getWideningDecision(I, VF);
8629     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8630            "CM decision should be taken at this point.");
8631     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8632       return true;
8633     if (CM.isScalarAfterVectorization(I, VF) ||
8634         CM.isProfitableToScalarize(I, VF))
8635       return false;
8636     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8637   };
8638 
8639   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8640     return nullptr;
8641 
8642   VPValue *Mask = nullptr;
8643   if (Legal->isMaskRequired(I))
8644     Mask = createBlockInMask(I->getParent(), Plan);
8645 
8646   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8647     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask);
8648 
8649   StoreInst *Store = cast<StoreInst>(I);
8650   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8651                                             Mask);
8652 }
8653 
8654 VPWidenIntOrFpInductionRecipe *
8655 VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi,
8656                                            ArrayRef<VPValue *> Operands) const {
8657   // Check if this is an integer or fp induction. If so, build the recipe that
8658   // produces its scalar and vector values.
8659   InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8660   if (II.getKind() == InductionDescriptor::IK_IntInduction ||
8661       II.getKind() == InductionDescriptor::IK_FpInduction) {
8662     assert(II.getStartValue() ==
8663            Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8664     const SmallVectorImpl<Instruction *> &Casts = II.getCastInsts();
8665     return new VPWidenIntOrFpInductionRecipe(
8666         Phi, Operands[0], Casts.empty() ? nullptr : Casts.front());
8667   }
8668 
8669   return nullptr;
8670 }
8671 
8672 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8673     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8674     VPlan &Plan) const {
8675   // Optimize the special case where the source is a constant integer
8676   // induction variable. Notice that we can only optimize the 'trunc' case
8677   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8678   // (c) other casts depend on pointer size.
8679 
8680   // Determine whether \p K is a truncation based on an induction variable that
8681   // can be optimized.
8682   auto isOptimizableIVTruncate =
8683       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8684     return [=](ElementCount VF) -> bool {
8685       return CM.isOptimizableIVTruncate(K, VF);
8686     };
8687   };
8688 
8689   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8690           isOptimizableIVTruncate(I), Range)) {
8691 
8692     InductionDescriptor II =
8693         Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
8694     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8695     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
8696                                              Start, nullptr, I);
8697   }
8698   return nullptr;
8699 }
8700 
8701 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8702                                                 ArrayRef<VPValue *> Operands,
8703                                                 VPlanPtr &Plan) {
8704   // If all incoming values are equal, the incoming VPValue can be used directly
8705   // instead of creating a new VPBlendRecipe.
8706   VPValue *FirstIncoming = Operands[0];
8707   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8708         return FirstIncoming == Inc;
8709       })) {
8710     return Operands[0];
8711   }
8712 
8713   // We know that all PHIs in non-header blocks are converted into selects, so
8714   // we don't have to worry about the insertion order and we can just use the
8715   // builder. At this point we generate the predication tree. There may be
8716   // duplications since this is a simple recursive scan, but future
8717   // optimizations will clean it up.
8718   SmallVector<VPValue *, 2> OperandsWithMask;
8719   unsigned NumIncoming = Phi->getNumIncomingValues();
8720 
8721   for (unsigned In = 0; In < NumIncoming; In++) {
8722     VPValue *EdgeMask =
8723       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8724     assert((EdgeMask || NumIncoming == 1) &&
8725            "Multiple predecessors with one having a full mask");
8726     OperandsWithMask.push_back(Operands[In]);
8727     if (EdgeMask)
8728       OperandsWithMask.push_back(EdgeMask);
8729   }
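  // The resulting operand list interleaves incoming values with their edge
  // masks: (In0, M0, In1, M1, ...), or just (In0) when the single
  // predecessor's edge mask is all-one (nullptr).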
8730   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8731 }
8732 
8733 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8734                                                    ArrayRef<VPValue *> Operands,
8735                                                    VFRange &Range) const {
8736 
8737   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8738       [this, CI](ElementCount VF) { return CM.isScalarWithPredication(CI); },
8739       Range);
8740 
8741   if (IsPredicated)
8742     return nullptr;
8743 
8744   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8745   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8746              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8747              ID == Intrinsic::pseudoprobe ||
8748              ID == Intrinsic::experimental_noalias_scope_decl))
8749     return nullptr;
8750 
8751   auto willWiden = [&](ElementCount VF) -> bool {
8752     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag indicates whether we use an intrinsic or a regular call for the
    // vectorized version of the instruction.
    // Is it beneficial to perform the intrinsic call compared to a library
    // call?
8757     bool NeedToScalarize = false;
8758     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8759     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8760     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8761     assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
8762            "Either the intrinsic cost or vector call cost must be valid");
8763     return UseVectorIntrinsic || !NeedToScalarize;
8764   };
8765 
8766   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8767     return nullptr;
8768 
8769   ArrayRef<VPValue *> Ops = Operands.take_front(CI->getNumArgOperands());
8770   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8771 }
8772 
8773 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8774   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8775          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
  // The instruction should be widened, unless it is scalar after
  // vectorization, scalarization is profitable, or it is predicated.
8778   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8779     return CM.isScalarAfterVectorization(I, VF) ||
8780            CM.isProfitableToScalarize(I, VF) || CM.isScalarWithPredication(I);
8781   };
8782   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8783                                                              Range);
8784 }
8785 
8786 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8787                                            ArrayRef<VPValue *> Operands) const {
8788   auto IsVectorizableOpcode = [](unsigned Opcode) {
8789     switch (Opcode) {
8790     case Instruction::Add:
8791     case Instruction::And:
8792     case Instruction::AShr:
8793     case Instruction::BitCast:
8794     case Instruction::FAdd:
8795     case Instruction::FCmp:
8796     case Instruction::FDiv:
8797     case Instruction::FMul:
8798     case Instruction::FNeg:
8799     case Instruction::FPExt:
8800     case Instruction::FPToSI:
8801     case Instruction::FPToUI:
8802     case Instruction::FPTrunc:
8803     case Instruction::FRem:
8804     case Instruction::FSub:
8805     case Instruction::ICmp:
8806     case Instruction::IntToPtr:
8807     case Instruction::LShr:
8808     case Instruction::Mul:
8809     case Instruction::Or:
8810     case Instruction::PtrToInt:
8811     case Instruction::SDiv:
8812     case Instruction::Select:
8813     case Instruction::SExt:
8814     case Instruction::Shl:
8815     case Instruction::SIToFP:
8816     case Instruction::SRem:
8817     case Instruction::Sub:
8818     case Instruction::Trunc:
8819     case Instruction::UDiv:
8820     case Instruction::UIToFP:
8821     case Instruction::URem:
8822     case Instruction::Xor:
8823     case Instruction::ZExt:
8824       return true;
8825     }
8826     return false;
8827   };
8828 
8829   if (!IsVectorizableOpcode(I->getOpcode()))
8830     return nullptr;
8831 
8832   // Success: widen this instruction.
8833   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8834 }
8835 
8836 void VPRecipeBuilder::fixHeaderPhis() {
8837   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8838   for (VPWidenPHIRecipe *R : PhisToFix) {
8839     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8840     VPRecipeBase *IncR =
8841         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8842     R->addOperand(IncR->getVPSingleValue());
8843   }
8844 }
8845 
8846 VPBasicBlock *VPRecipeBuilder::handleReplication(
8847     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8848     VPlanPtr &Plan) {
8849   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8850       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8851       Range);
8852 
8853   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8854       [&](ElementCount VF) { return CM.isPredicatedInst(I); }, Range);
8855 
8856   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8857                                        IsUniform, IsPredicated);
8858   setRecipe(I, Recipe);
8859   Plan->addVPValue(I, Recipe);
8860 
8861   // Find if I uses a predicated instruction. If so, it will use its scalar
8862   // value. Avoid hoisting the insert-element which packs the scalar value into
8863   // a vector value, as that happens iff all users use the vector value.
8864   for (VPValue *Op : Recipe->operands()) {
8865     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8866     if (!PredR)
8867       continue;
8868     auto *RepR =
8869         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8870     assert(RepR->isPredicated() &&
8871            "expected Replicate recipe to be predicated");
8872     RepR->setAlsoPack(false);
8873   }
8874 
8875   // Finalize the recipe for Instr, first if it is not predicated.
8876   if (!IsPredicated) {
8877     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8878     VPBB->appendRecipe(Recipe);
8879     return VPBB;
8880   }
8881   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8882   assert(VPBB->getSuccessors().empty() &&
8883          "VPBB has successors when handling predicated replication.");
8884   // Record predicated instructions for above packing optimizations.
8885   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8886   VPBlockUtils::insertBlockAfter(Region, VPBB);
8887   auto *RegSucc = new VPBasicBlock();
8888   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8889   return RegSucc;
8890 }
8891 
8892 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8893                                                       VPRecipeBase *PredRecipe,
8894                                                       VPlanPtr &Plan) {
8895   // Instructions marked for predication are replicated and placed under an
8896   // if-then construct to prevent side-effects.
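  // The resulting region has the following shape (block names are
  // illustrative):
  //   pred.<opcode>.entry:    BRANCH-ON-MASK of the block-in mask
  //   pred.<opcode>.if:       the replicated, predicated recipe
  //   pred.<opcode>.continue: an optional PHI merging the predicated value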
8897 
8898   // Generate recipes to compute the block mask for this region.
8899   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8900 
8901   // Build the triangular if-then region.
8902   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8903   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8904   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8905   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8906   auto *PHIRecipe = Instr->getType()->isVoidTy()
8907                         ? nullptr
8908                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8909   if (PHIRecipe) {
8910     Plan->removeVPValueFor(Instr);
8911     Plan->addVPValue(Instr, PHIRecipe);
8912   }
8913   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8914   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8915   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8916 
8917   // Note: first set Entry as region entry and then connect successors starting
8918   // from it in order, to propagate the "parent" of each VPBasicBlock.
8919   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8920   VPBlockUtils::connectBlocks(Pred, Exit);
8921 
8922   return Region;
8923 }
8924 
8925 VPRecipeOrVPValueTy
8926 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8927                                         ArrayRef<VPValue *> Operands,
8928                                         VFRange &Range, VPlanPtr &Plan) {
8929   // First, check for specific widening recipes that deal with calls, memory
8930   // operations, inductions and Phi nodes.
8931   if (auto *CI = dyn_cast<CallInst>(Instr))
8932     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8933 
8934   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8935     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8936 
8937   VPRecipeBase *Recipe;
8938   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8939     if (Phi->getParent() != OrigLoop->getHeader())
8940       return tryToBlend(Phi, Operands, Plan);
8941     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands)))
8942       return toVPRecipeResult(Recipe);
8943 
8944     if (Legal->isReductionVariable(Phi)) {
8945       RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
8946       assert(RdxDesc.getRecurrenceStartValue() ==
8947              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8948       VPValue *StartV = Operands[0];
8949 
8950       auto *PhiRecipe = new VPWidenPHIRecipe(Phi, RdxDesc, *StartV);
8951       PhisToFix.push_back(PhiRecipe);
8952       // Record the incoming value from the backedge, so we can add the incoming
8953       // value from the backedge after all recipes have been created.
8954       recordRecipeOf(cast<Instruction>(
8955           Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8956       return toVPRecipeResult(PhiRecipe);
8957     }
8958 
8959     return toVPRecipeResult(new VPWidenPHIRecipe(Phi));
8960   }
8961 
8962   if (isa<TruncInst>(Instr) &&
8963       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8964                                                Range, *Plan)))
8965     return toVPRecipeResult(Recipe);
8966 
8967   if (!shouldWiden(Instr, Range))
8968     return nullptr;
8969 
8970   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8971     return toVPRecipeResult(new VPWidenGEPRecipe(
8972         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8973 
8974   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8975     bool InvariantCond =
8976         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8977     return toVPRecipeResult(new VPWidenSelectRecipe(
8978         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8979   }
8980 
8981   return toVPRecipeResult(tryToWiden(Instr, Operands));
8982 }
8983 
8984 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8985                                                         ElementCount MaxVF) {
8986   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8987 
8988   // Collect instructions from the original loop that will become trivially dead
8989   // in the vectorized loop. We don't need to vectorize these instructions. For
8990   // example, original induction update instructions can become dead because we
8991   // separately emit induction "steps" when generating code for the new loop.
8992   // Similarly, we create a new latch condition when setting up the structure
8993   // of the new loop, so the old one can become dead.
8994   SmallPtrSet<Instruction *, 4> DeadInstructions;
8995   collectTriviallyDeadInstructions(DeadInstructions);
8996 
8997   // Add assume instructions we need to drop to DeadInstructions, to prevent
8998   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
9000   // control flow is preserved, we should keep them.
9001   auto &ConditionalAssumes = Legal->getConditionalAssumes();
9002   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
9003 
9004   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
9005   // Dead instructions do not need sinking. Remove them from SinkAfter.
9006   for (Instruction *I : DeadInstructions)
9007     SinkAfter.erase(I);
9008 
9009   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
9010   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
9011     VFRange SubRange = {VF, MaxVFPlusOne};
9012     VPlans.push_back(
9013         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
9014     VF = SubRange.End;
9015   }
9016 }
9017 
9018 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
9019     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
9020     const MapVector<Instruction *, Instruction *> &SinkAfter) {
9021 
9022   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
9023 
9024   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
9025 
9026   // ---------------------------------------------------------------------------
9027   // Pre-construction: record ingredients whose recipes we'll need to further
9028   // process after constructing the initial VPlan.
9029   // ---------------------------------------------------------------------------
9030 
9031   // Mark instructions we'll need to sink later and their targets as
9032   // ingredients whose recipe we'll need to record.
9033   for (auto &Entry : SinkAfter) {
9034     RecipeBuilder.recordRecipeOf(Entry.first);
9035     RecipeBuilder.recordRecipeOf(Entry.second);
9036   }
9037   for (auto &Reduction : CM.getInLoopReductionChains()) {
9038     PHINode *Phi = Reduction.first;
9039     RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
9040     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9041 
9042     RecipeBuilder.recordRecipeOf(Phi);
9043     for (auto &R : ReductionOperations) {
9044       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
9046       // need to record the ICmp recipe, so it can be removed later.
9047       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9048         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9049     }
9050   }
9051 
9052   // For each interleave group which is relevant for this (possibly trimmed)
9053   // Range, add it to the set of groups to be later applied to the VPlan and add
9054   // placeholders for its members' Recipes which we'll be replacing with a
9055   // single VPInterleaveRecipe.
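  // E.g. (illustrative), loads of A[2*i] and A[2*i+1] forming a factor-2 group
  // each get a placeholder recipe recorded here and are later folded into a
  // single VPInterleaveRecipe at the group's insertion position.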
9056   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9057     auto applyIG = [IG, this](ElementCount VF) -> bool {
9058       return (VF.isVector() && // Query is illegal for VF == 1
9059               CM.getWideningDecision(IG->getInsertPos(), VF) ==
9060                   LoopVectorizationCostModel::CM_Interleave);
9061     };
9062     if (!getDecisionAndClampRange(applyIG, Range))
9063       continue;
9064     InterleaveGroups.insert(IG);
9065     for (unsigned i = 0; i < IG->getFactor(); i++)
9066       if (Instruction *Member = IG->getMember(i))
9067         RecipeBuilder.recordRecipeOf(Member);
  }
9069 
9070   // ---------------------------------------------------------------------------
9071   // Build initial VPlan: Scan the body of the loop in a topological order to
9072   // visit each basic block after having visited its predecessor basic blocks.
9073   // ---------------------------------------------------------------------------
9074 
9075   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
9076   auto Plan = std::make_unique<VPlan>();
9077   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
9078   Plan->setEntry(VPBB);
9079 
9080   // Scan the body of the loop in a topological order to visit each basic block
9081   // after having visited its predecessor basic blocks.
9082   LoopBlocksDFS DFS(OrigLoop);
9083   DFS.perform(LI);
9084 
9085   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
    // Relevant instructions from basic block BB will be grouped into VPRecipe
    // ingredients and will fill a new VPBasicBlock.
9088     unsigned VPBBsForBB = 0;
9089     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
9090     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
9091     VPBB = FirstVPBBForBB;
9092     Builder.setInsertPoint(VPBB);
9093 
9094     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
9096     for (Instruction &I : BB->instructionsWithoutDebug()) {
9097       Instruction *Instr = &I;
9098 
9099       // First filter out irrelevant instructions, to ensure no recipes are
9100       // built for them.
9101       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9102         continue;
9103 
9104       SmallVector<VPValue *, 4> Operands;
9105       auto *Phi = dyn_cast<PHINode>(Instr);
9106       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9107         Operands.push_back(Plan->getOrAddVPValue(
9108             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9109       } else {
9110         auto OpRange = Plan->mapToVPValues(Instr->operands());
9111         Operands = {OpRange.begin(), OpRange.end()};
9112       }
9113       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9114               Instr, Operands, Range, Plan)) {
9115         // If Instr can be simplified to an existing VPValue, use it.
9116         if (RecipeOrValue.is<VPValue *>()) {
9117           auto *VPV = RecipeOrValue.get<VPValue *>();
9118           Plan->addVPValue(Instr, VPV);
9119           // If the re-used value is a recipe, register the recipe for the
9120           // instruction, in case the recipe for Instr needs to be recorded.
9121           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
9122             RecipeBuilder.setRecipe(Instr, R);
9123           continue;
9124         }
9125         // Otherwise, add the new recipe.
9126         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
9127         for (auto *Def : Recipe->definedValues()) {
9128           auto *UV = Def->getUnderlyingValue();
9129           Plan->addVPValue(UV, Def);
9130         }
9131 
9132         RecipeBuilder.setRecipe(Instr, Recipe);
9133         VPBB->appendRecipe(Recipe);
9134         continue;
9135       }
9136 
9137       // Otherwise, if all widening options failed, Instruction is to be
9138       // replicated. This may create a successor for VPBB.
9139       VPBasicBlock *NextVPBB =
9140           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
9141       if (NextVPBB != VPBB) {
9142         VPBB = NextVPBB;
9143         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
9144                                     : "");
9145       }
9146     }
9147   }
9148 
9149   RecipeBuilder.fixHeaderPhis();
9150 
9151   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
  // may also be empty, such as the last one (VPBB), reflecting original
  // basic blocks with no recipes.
9154   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
9155   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
9156   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
9157   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
9158   delete PreEntry;
9159 
9160   // ---------------------------------------------------------------------------
9161   // Transform initial VPlan: Apply previously taken decisions, in order, to
9162   // bring the VPlan to its final state.
9163   // ---------------------------------------------------------------------------
9164 
9165   // Apply Sink-After legal constraints.
9166   for (auto &Entry : SinkAfter) {
9167     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9168     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9169 
9170     auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9171       auto *Region =
9172           dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9173       if (Region && Region->isReplicator())
9174         return Region;
9175       return nullptr;
9176     };
9177 
9178     // If the target is in a replication region, make sure to move Sink to the
9179     // block after it, not into the replication region itself.
9180     if (auto *TargetRegion = GetReplicateRegion(Target)) {
9181       assert(TargetRegion->getNumSuccessors() == 1 && "Expected SESE region!");
9182       assert(!GetReplicateRegion(Sink) &&
9183              "cannot sink a region into another region yet");
9184       VPBasicBlock *NextBlock =
9185           cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9186       Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9187       continue;
9188     }
9189 
9190     auto *SinkRegion = GetReplicateRegion(Sink);
9191     // Unless the sink source is in a replicate region, sink the recipe
9192     // directly.
9193     if (!SinkRegion) {
9194       Sink->moveAfter(Target);
9195       continue;
9196     }
9197 
9198     // If the sink source is in a replicate region, we need to move the whole
9199     // replicate region, which should only contain a single recipe in the main
9200     // block.
9201     assert(Sink->getParent()->size() == 1 &&
9202            "parent must be a replicator with a single recipe");
9203     auto *SplitBlock =
9204         Target->getParent()->splitAt(std::next(Target->getIterator()));
9205 
9206     auto *Pred = SinkRegion->getSinglePredecessor();
9207     auto *Succ = SinkRegion->getSingleSuccessor();
9208     VPBlockUtils::disconnectBlocks(Pred, SinkRegion);
9209     VPBlockUtils::disconnectBlocks(SinkRegion, Succ);
9210     VPBlockUtils::connectBlocks(Pred, Succ);
9211 
9212     auto *SplitPred = SplitBlock->getSinglePredecessor();
9213 
9214     VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9215     VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9216     VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9217     if (VPBB == SplitPred)
9218       VPBB = SplitBlock;
9219   }
9220 
9221   // Interleave memory: for each Interleave Group we marked earlier as relevant
9222   // for this VPlan, replace the Recipes widening its memory instructions with a
9223   // single VPInterleaveRecipe at its insertion point.
9224   for (auto IG : InterleaveGroups) {
9225     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9226         RecipeBuilder.getRecipe(IG->getInsertPos()));
9227     SmallVector<VPValue *, 4> StoredValues;
9228     for (unsigned i = 0; i < IG->getFactor(); ++i)
9229       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
9230         StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
9231 
9232     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9233                                         Recipe->getMask());
9234     VPIG->insertBefore(Recipe);
9235     unsigned J = 0;
9236     for (unsigned i = 0; i < IG->getFactor(); ++i)
9237       if (Instruction *Member = IG->getMember(i)) {
9238         if (!Member->getType()->isVoidTy()) {
9239           VPValue *OriginalV = Plan->getVPValue(Member);
9240           Plan->removeVPValueFor(Member);
9241           Plan->addVPValue(Member, VPIG->getVPValue(J));
9242           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9243           J++;
9244         }
9245         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9246       }
9247   }
9248 
9249   // Adjust the recipes for any inloop reductions.
9250   if (Range.Start.isVector())
9251     adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
9252 
9253   // Finally, if tail is folded by masking, introduce selects between the phi
9254   // and the live-out instruction of each reduction, at the end of the latch.
9255   if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
9256     Builder.setInsertPoint(VPBB);
9257     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9258     for (auto &Reduction : Legal->getReductionVars()) {
9259       if (CM.isInLoopReduction(Reduction.first))
9260         continue;
9261       VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
9262       VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
9263       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
9264     }
9265   }
9266 
9267   VPlanTransforms::sinkScalarOperands(*Plan);
9268 
9269   std::string PlanName;
9270   raw_string_ostream RSO(PlanName);
9271   ElementCount VF = Range.Start;
9272   Plan->addVF(VF);
9273   RSO << "Initial VPlan for VF={" << VF;
9274   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9275     Plan->addVF(VF);
9276     RSO << "," << VF;
9277   }
9278   RSO << "},UF>=1";
9279   RSO.flush();
9280   Plan->setName(PlanName);
9281 
9282   return Plan;
9283 }
9284 
9285 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
9288   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9289   // the vectorization pipeline.
9290   assert(!OrigLoop->isInnermost());
9291   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9292 
9293   // Create new empty VPlan
9294   auto Plan = std::make_unique<VPlan>();
9295 
9296   // Build hierarchical CFG
9297   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9298   HCFGBuilder.buildHierarchicalCFG();
9299 
9300   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9301        VF *= 2)
9302     Plan->addVF(VF);
9303 
9304   if (EnableVPlanPredication) {
9305     VPlanPredicator VPP(*Plan);
9306     VPP.predicate();
9307 
9308     // Avoid running transformation to recipes until masked code generation in
9309     // VPlan-native path is in place.
9310     return Plan;
9311   }
9312 
9313   SmallPtrSet<Instruction *, 1> DeadInstructions;
9314   VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan,
9315                                              Legal->getInductionVars(),
9316                                              DeadInstructions, *PSE.getSE());
9317   return Plan;
9318 }
9319 
// Adjust the recipes for any in-loop reductions. The chain of instructions
// leading from the loop exit instr to the phi needs to be converted to
// reductions, with one operand being vector and the other being the scalar
// reduction chain.
9324 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
9325     VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
9326   for (auto &Reduction : CM.getInLoopReductionChains()) {
9327     PHINode *Phi = Reduction.first;
9328     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
9329     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9330 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
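    // E.g. (illustrative), for 'sum += a[i]' the chain is phi -> add; the
    // add's widened recipe is replaced below by a VPReductionRecipe whose
    // ChainOp is the phi's value and whose VecOp is the widened load of a[i].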
9335     Instruction *Chain = Phi;
9336     for (Instruction *R : ReductionOperations) {
9337       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9338       RecurKind Kind = RdxDesc.getRecurrenceKind();
9339 
9340       VPValue *ChainOp = Plan->getVPValue(Chain);
9341       unsigned FirstOpId;
9342       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9343         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9344                "Expected to replace a VPWidenSelectSC");
9345         FirstOpId = 1;
9346       } else {
9347         assert(isa<VPWidenRecipe>(WidenRecipe) &&
9348                "Expected to replace a VPWidenSC");
9349         FirstOpId = 0;
9350       }
9351       unsigned VecOpId =
9352           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9353       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9354 
9355       auto *CondOp = CM.foldTailByMasking()
9356                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9357                          : nullptr;
9358       VPReductionRecipe *RedRecipe = new VPReductionRecipe(
9359           &RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9360       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9361       Plan->removeVPValueFor(R);
9362       Plan->addVPValue(R, RedRecipe);
9363       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9365       WidenRecipe->eraseFromParent();
9366 
9367       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9368         VPRecipeBase *CompareRecipe =
9369             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9370         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9371                "Expected to replace a VPWidenSC");
9372         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9373                "Expected no remaining users");
9374         CompareRecipe->eraseFromParent();
9375       }
9376       Chain = R;
9377     }
9378   }
9379 }
9380 
9381 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9382 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9383                                VPSlotTracker &SlotTracker) const {
9384   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9385   IG->getInsertPos()->printAsOperand(O, false);
9386   O << ", ";
9387   getAddr()->printAsOperand(O, SlotTracker);
9388   VPValue *Mask = getMask();
9389   if (Mask) {
9390     O << ", ";
9391     Mask->printAsOperand(O, SlotTracker);
9392   }
9393   for (unsigned i = 0; i < IG->getFactor(); ++i)
9394     if (Instruction *I = IG->getMember(i))
9395       O << "\n" << Indent << "  " << VPlanIngredient(I) << " " << i;
9396 }
9397 #endif
9398 
9399 void VPWidenCallRecipe::execute(VPTransformState &State) {
9400   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9401                                   *this, State);
9402 }
9403 
9404 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9405   State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
9406                                     this, *this, InvariantCond, State);
9407 }
9408 
9409 void VPWidenRecipe::execute(VPTransformState &State) {
9410   State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
9411 }
9412 
9413 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9414   State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
9415                       *this, State.UF, State.VF, IsPtrLoopInvariant,
9416                       IsIndexLoopInvariant, State);
9417 }
9418 
9419 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9420   assert(!State.Instance && "Int or FP induction being replicated.");
9421   State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
9422                                    getTruncInst(), getVPValue(0),
9423                                    getCastValue(), State);
9424 }
9425 
9426 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9427   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), RdxDesc,
9428                                  this, State);
9429 }
9430 
9431 void VPBlendRecipe::execute(VPTransformState &State) {
9432   State.ILV->setDebugLocFromInst(State.Builder, Phi);
9433   // We know that all PHIs in non-header blocks are converted into
9434   // selects, so we don't have to worry about the insertion order and we
9435   // can just use the builder.
9436   // At this point we generate the predication tree. There may be
9437   // duplications since this is a simple recursive scan, but future
9438   // optimizations will clean it up.
9439 
9440   unsigned NumIncoming = getNumIncomingValues();
9441 
9442   // Generate a sequence of selects of the form:
9443   // SELECT(Mask3, In3,
9444   //        SELECT(Mask2, In2,
9445   //               SELECT(Mask1, In1,
9446   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // are essentially undef and take their value from In0.
9449   InnerLoopVectorizer::VectorParts Entry(State.UF);
9450   for (unsigned In = 0; In < NumIncoming; ++In) {
9451     for (unsigned Part = 0; Part < State.UF; ++Part) {
9452       // We might have single edge PHIs (blocks) - use an identity
9453       // 'select' for the first PHI operand.
9454       Value *In0 = State.get(getIncomingValue(In), Part);
9455       if (In == 0)
9456         Entry[Part] = In0; // Initialize with the first incoming value.
9457       else {
9458         // Select between the current value and the previous incoming edge
9459         // based on the incoming mask.
9460         Value *Cond = State.get(getMask(In), Part);
9461         Entry[Part] =
9462             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9463       }
9464     }
9465   }
9466   for (unsigned Part = 0; Part < State.UF; ++Part)
9467     State.set(this, Entry[Part], Part);
9468 }
9469 
9470 void VPInterleaveRecipe::execute(VPTransformState &State) {
9471   assert(!State.Instance && "Interleave group being replicated.");
9472   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9473                                       getStoredValues(), getMask());
9474 }
9475 
9476 void VPReductionRecipe::execute(VPTransformState &State) {
9477   assert(!State.Instance && "Reduction being replicated.");
9478   Value *PrevInChain = State.get(getChainOp(), 0);
9479   for (unsigned Part = 0; Part < State.UF; ++Part) {
9480     RecurKind Kind = RdxDesc->getRecurrenceKind();
9481     bool IsOrdered = useOrderedReductions(*RdxDesc);
9482     Value *NewVecOp = State.get(getVecOp(), Part);
9483     if (VPValue *Cond = getCondOp()) {
9484       Value *NewCond = State.get(Cond, Part);
9485       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9486       Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
9487           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9488       Constant *IdenVec =
9489           ConstantVector::getSplat(VecTy->getElementCount(), Iden);
9490       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9491       NewVecOp = Select;
9492     }
9493     Value *NewRed;
9494     Value *NextInChain;
9495     if (IsOrdered) {
9496       NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9497                                       PrevInChain);
9498       PrevInChain = NewRed;
9499     } else {
9500       PrevInChain = State.get(getChainOp(), Part);
9501       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9502     }
9503     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9504       NextInChain =
9505           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9506                          NewRed, PrevInChain);
9507     } else if (IsOrdered)
9508       NextInChain = NewRed;
9509     else {
9510       NextInChain = State.Builder.CreateBinOp(
9511           (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
9512           PrevInChain);
9513     }
9514     State.set(this, NextInChain, Part);
9515   }
9516 }
9517 
9518 void VPReplicateRecipe::execute(VPTransformState &State) {
9519   if (State.Instance) { // Generate a single instance.
9520     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9521     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9522                                     *State.Instance, IsPredicated, State);
9523     // Insert scalar instance packing it into a vector.
9524     if (AlsoPack && State.VF.isVector()) {
9525       // If we're constructing lane 0, initialize to start from poison.
9526       if (State.Instance->Lane.isFirstLane()) {
9527         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9528         Value *Poison = PoisonValue::get(
9529             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9530         State.set(this, Poison, State.Instance->Part);
9531       }
9532       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9533     }
9534     return;
9535   }
9536 
9537   // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for each
9539   // of the UF parts.
9540   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9541   assert((!State.VF.isScalable() || IsUniform) &&
9542          "Can't scalarize a scalable vector");
9543   for (unsigned Part = 0; Part < State.UF; ++Part)
9544     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9545       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *this,
9546                                       VPIteration(Part, Lane), IsPredicated,
9547                                       State);
9548 }
9549 
9550 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9551   assert(State.Instance && "Branch on Mask works only on single instance.");
9552 
9553   unsigned Part = State.Instance->Part;
9554   unsigned Lane = State.Instance->Lane.getKnownLane();
9555 
9556   Value *ConditionBit = nullptr;
9557   VPValue *BlockInMask = getMask();
9558   if (BlockInMask) {
9559     ConditionBit = State.get(BlockInMask, Part);
9560     if (ConditionBit->getType()->isVectorTy())
9561       ConditionBit = State.Builder.CreateExtractElement(
9562           ConditionBit, State.Builder.getInt32(Lane));
9563   } else // Block in mask is all-one.
9564     ConditionBit = State.Builder.getTrue();
9565 
9566   // Replace the temporary unreachable terminator with a new conditional branch,
9567   // whose two destinations will be set later when they are created.
9568   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9569   assert(isa<UnreachableInst>(CurrentTerminator) &&
9570          "Expected to replace unreachable terminator with conditional branch.");
9571   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9572   CondBr->setSuccessor(0, nullptr);
9573   ReplaceInstWithInst(CurrentTerminator, CondBr);
9574 }
9575 
9576 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9577   assert(State.Instance && "Predicated instruction PHI works per instance.");
9578   Instruction *ScalarPredInst =
9579       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9580   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9581   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9582   assert(PredicatingBB && "Predicated block has no single predecessor.");
9583   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9584          "operand must be VPReplicateRecipe");
9585 
9586   // By current pack/unpack logic we need to generate only a single phi node: if
9587   // a vector value for the predicated instruction exists at this point it means
9588   // the instruction has vector users only, and a phi for the vector value is
9589   // needed. In this case the recipe of the predicated instruction is marked to
9590   // also do that packing, thereby "hoisting" the insert-element sequence.
9591   // Otherwise, a phi node for the scalar value is needed.
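  // E.g. (illustrative) for the vector case, the generated IR resembles:
  //   %vphi = phi <VF x ty> [ %vec.before.insert, %predicating.bb ],
  //                         [ %vec.with.inserted.elt, %predicated.bb ]
  // while the scalar case creates a phi of the predicated scalar value, with
  // poison incoming from the predicating block.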
9592   unsigned Part = State.Instance->Part;
9593   if (State.hasVectorValue(getOperand(0), Part)) {
9594     Value *VectorValue = State.get(getOperand(0), Part);
9595     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9596     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9597     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9598     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9599     if (State.hasVectorValue(this, Part))
9600       State.reset(this, VPhi, Part);
9601     else
9602       State.set(this, VPhi, Part);
9603     // NOTE: Currently we need to update the value of the operand, so the next
9604     // predicated iteration inserts its generated value in the correct vector.
9605     State.reset(getOperand(0), VPhi, Part);
9606   } else {
9607     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9608     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9609     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9610                      PredicatingBB);
9611     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9612     if (State.hasScalarValue(this, *State.Instance))
9613       State.reset(this, Phi, *State.Instance);
9614     else
9615       State.set(this, Phi, *State.Instance);
9616     // NOTE: Currently we need to update the value of the operand, so the next
9617     // predicated iteration inserts its generated value in the correct vector.
9618     State.reset(getOperand(0), Phi, *State.Instance);
9619   }
9620 }
9621 
9622 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9623   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
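  // A store defines no VPValue, so the recipe's single defined value is passed
  // along only for loads.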
9624   State.ILV->vectorizeMemoryInstruction(
9625       &Ingredient, State, StoredValue ? nullptr : getVPSingleValue(), getAddr(),
9626       StoredValue, getMask());
9627 }
9628 
9629 // Determine how to lower the scalar epilogue, which depends on 1) optimizing
9630 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9631 // predication, and 4) a TTI hook that analyzes whether the loop is suitable
9632 // for predication.
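// The checks below are applied in the listed order; for example, an explicit
// PreferPredicateOverEpilogue option (2) takes precedence over the predicate
// loop hint (3), and optimizing for size (1) overrides both.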
9633 static ScalarEpilogueLowering getScalarEpilogueLowering(
9634     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9635     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9636     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9637     LoopVectorizationLegality &LVL) {
9638   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9639   // don't look at hints or options, and don't request a scalar epilogue.
9640   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9641   // LoopAccessInfo (due to code dependency and not being able to reliably get
9642   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9643   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9644   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9645   // back to the old way and vectorize with versioning when forced. See D81345.)
9646   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9647                                                       PGSOQueryType::IRPass) &&
9648                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9649     return CM_ScalarEpilogueNotAllowedOptSize;
9650 
9651   // 2) If set, obey the directives.
9652   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9653     switch (PreferPredicateOverEpilogue) {
9654     case PreferPredicateTy::ScalarEpilogue:
9655       return CM_ScalarEpilogueAllowed;
9656     case PreferPredicateTy::PredicateElseScalarEpilogue:
9657       return CM_ScalarEpilogueNotNeededUsePredicate;
9658     case PreferPredicateTy::PredicateOrDontVectorize:
9659       return CM_ScalarEpilogueNotAllowedUsePredicate;
9660     }
9661   }
9662 
9663   // 3) If set, obey the hints.
9664   switch (Hints.getPredicate()) {
9665   case LoopVectorizeHints::FK_Enabled:
9666     return CM_ScalarEpilogueNotNeededUsePredicate;
9667   case LoopVectorizeHints::FK_Disabled:
9668     return CM_ScalarEpilogueAllowed;
9669   }
9670 
9671   // 4) If the TTI hook indicates this is profitable, request predication.
9672   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
9673                                        LVL.getLAI()))
9674     return CM_ScalarEpilogueNotNeededUsePredicate;
9675 
9676   return CM_ScalarEpilogueAllowed;
9677 }
9678 
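// Look up or materialize the vector value of Def for unroll part Part: live-in
// IR values are broadcast, values that are uniform after vectorization are
// broadcast from lane 0, and scalarized values are packed into a vector with a
// sequence of insertelement instructions.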
9679 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
9680   // If values have been set for this Def, return the one relevant for \p Part.
9681   if (hasVectorValue(Def, Part))
9682     return Data.PerPartOutput[Def][Part];
9683 
9684   if (!hasScalarValue(Def, {Part, 0})) {
9685     Value *IRV = Def->getLiveInIRValue();
9686     Value *B = ILV->getBroadcastInstrs(IRV);
9687     set(Def, B, Part);
9688     return B;
9689   }
9690 
9691   Value *ScalarValue = get(Def, {Part, 0});
9692   // If we aren't vectorizing, we can just copy the scalar map values over
9693   // to the vector map.
9694   if (VF.isScalar()) {
9695     set(Def, ScalarValue, Part);
9696     return ScalarValue;
9697   }
9698 
9699   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
9700   bool IsUniform = RepR && RepR->isUniform();
9701 
9702   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9703   // Check if there is a scalar value for the selected lane.
9704   if (!hasScalarValue(Def, {Part, LastLane})) {
9705     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
9706     assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
9707            "unexpected recipe found to be invariant");
9708     IsUniform = true;
9709     LastLane = 0;
9710   }
9711 
9712   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9713 
9714   // Set the insert point after the last scalarized instruction. This
9715   // ensures the insertelement sequence will directly follow the scalar
9716   // definitions.
9717   auto OldIP = Builder.saveIP();
9718   auto NewIP = std::next(BasicBlock::iterator(LastInst));
9719   Builder.SetInsertPoint(&*NewIP);
9720 
9721   // However, if we are vectorizing, we need to construct the vector values.
9722   // If the value is known to be uniform after vectorization, we can just
9723   // broadcast the scalar value corresponding to lane zero for each unroll
9724   // iteration. Otherwise, we construct the vector values using
9725   // insertelement instructions. Since the resulting vectors are stored in
9726   // State, we will only generate the insertelements once.
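  // For example (illustrative), packing a scalarized i32 definition at VF = 4
  // produces a chain that starts from poison:
  //   %pack.0 = insertelement <4 x i32> poison,  i32 %s0, i32 0
  //   %pack.1 = insertelement <4 x i32> %pack.0, i32 %s1, i32 1
  //   ...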
9727   Value *VectorValue = nullptr;
9728   if (IsUniform) {
9729     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9730     set(Def, VectorValue, Part);
9731   } else {
9732     // Initialize packing with insertelements to start from poison.
9733     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
9734     Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
9735     set(Def, Poison, Part);
9736     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9737       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9738     VectorValue = get(Def, Part);
9739   }
9740   Builder.restoreIP(OldIP);
9741   return VectorValue;
9742 }
9743 
9744 // Process the loop in the VPlan-native vectorization path. This path builds
9745 // VPlan upfront in the vectorization pipeline, which allows applying
9746 // VPlan-to-VPlan transformations from the very beginning without modifying the
9747 // input LLVM IR.
9748 static bool processLoopInVPlanNativePath(
9749     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9750     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9751     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9752     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9753     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9754     LoopVectorizationRequirements &Requirements) {
9755 
9756   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9757     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9758     return false;
9759   }
9760   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9761   Function *F = L->getHeader()->getParent();
9762   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9763 
9764   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9765       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
9766 
9767   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9768                                 &Hints, IAI);
9769   // Use the planner for outer loop vectorization.
9770   // TODO: CM is not used at this point inside the planner. Turn CM into an
9771   // optional argument if we don't need it in the future.
9772   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
9773                                Requirements, ORE);
9774 
9775   // Get user vectorization factor.
9776   ElementCount UserVF = Hints.getWidth();
9777 
9778   // Plan how to best vectorize, return the best VF and its cost.
9779   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9780 
9781   // If we are stress testing VPlan builds, do not attempt to generate vector
9782   // code. Masked vector code generation support will follow soon.
9783   // Also, do not attempt to vectorize if no vector code will be produced.
9784   if (VPlanBuildStressTest || EnableVPlanPredication ||
9785       VectorizationFactor::Disabled() == VF)
9786     return false;
9787 
9788   LVP.setBestPlan(VF.Width, 1);
9789 
9790   {
9791     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
9792                              F->getParent()->getDataLayout());
9793     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
9794                            &CM, BFI, PSI, Checks);
9795     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9796                       << L->getHeader()->getParent()->getName() << "\"\n");
9797     LVP.executePlan(LB, DT);
9798   }
9799 
9800   // Mark the loop as already vectorized to avoid vectorizing again.
9801   Hints.setAlreadyVectorized();
9802   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9803   return true;
9804 }
9805 
9806 // Emit a remark if there are stores to floats that required a floating point
9807 // extension. If the vectorized loop ends up containing these conversions,
9808 // there will be a performance penalty from the conversion overhead and from
9809 // the change in the vector width.
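// For example (illustrative), a loop body such as 'F[i] = F[i] * D;' with
// 'float F[]' and 'double D' extends F[i] to double via fpext, computes the
// product in double precision, and truncates back to float for the store; the
// fpext found on that path is what triggers the remark.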
9810 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9811   SmallVector<Instruction *, 4> Worklist;
9812   for (BasicBlock *BB : L->getBlocks()) {
9813     for (Instruction &Inst : *BB) {
9814       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9815         if (S->getValueOperand()->getType()->isFloatTy())
9816           Worklist.push_back(S);
9817       }
9818     }
9819   }
9820 
9821   // Traverse upwards from the floating point stores, searching for floating
9822   // point conversions.
9823   SmallPtrSet<const Instruction *, 4> Visited;
9824   SmallPtrSet<const Instruction *, 4> EmittedRemark;
9825   while (!Worklist.empty()) {
9826     auto *I = Worklist.pop_back_val();
9827     if (!L->contains(I))
9828       continue;
9829     if (!Visited.insert(I).second)
9830       continue;
9831 
9832     // Emit a remark if the floating point store required a floating
9833     // point conversion.
9834     // TODO: More work could be done to identify the root cause such as a
9835     // constant or a function return type and point the user to it.
9836     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9837       ORE->emit([&]() {
9838         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9839                                           I->getDebugLoc(), L->getHeader())
9840                << "floating point conversion changes vector width. "
9841                << "Mixed floating point precision requires an up/down "
9842                << "cast that will negatively impact performance.";
9843       });
9844 
9845     for (Use &Op : I->operands())
9846       if (auto *OpI = dyn_cast<Instruction>(Op))
9847         Worklist.push_back(OpI);
9848   }
9849 }
9850 
9851 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9852     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9853                                !EnableLoopInterleaving),
9854       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9855                               !EnableLoopVectorization) {}
9856 
9857 bool LoopVectorizePass::processLoop(Loop *L) {
9858   assert((EnableVPlanNativePath || L->isInnermost()) &&
9859          "VPlan-native path is not enabled. Only process inner loops.");
9860 
9861 #ifndef NDEBUG
9862   const std::string DebugLocStr = getDebugLocString(L);
9863 #endif /* NDEBUG */
9864 
9865   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
9866                     << L->getHeader()->getParent()->getName() << "\" from "
9867                     << DebugLocStr << "\n");
9868 
9869   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
9870 
9871   LLVM_DEBUG(
9872       dbgs() << "LV: Loop hints:"
9873              << " force="
9874              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9875                      ? "disabled"
9876                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9877                             ? "enabled"
9878                             : "?"))
9879              << " width=" << Hints.getWidth()
9880              << " interleave=" << Hints.getInterleave() << "\n");
9881 
9882   // Function containing loop
9883   Function *F = L->getHeader()->getParent();
9884 
9885   // Looking at the diagnostic output is the only way to determine if a loop
9886   // was vectorized (other than looking at the IR or machine code), so it
9887   // is important to generate an optimization remark for each loop. Most of
9888   // these messages are generated as OptimizationRemarkAnalysis. Remarks
9889   // generated as OptimizationRemark and OptimizationRemarkMissed are less
9890   // verbose, reporting vectorized loops and unvectorized loops that may
9891   // benefit from vectorization, respectively.
9892 
9893   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9894     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9895     return false;
9896   }
9897 
9898   PredicatedScalarEvolution PSE(*SE, *L);
9899 
9900   // Check if it is legal to vectorize the loop.
9901   LoopVectorizationRequirements Requirements;
9902   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
9903                                 &Requirements, &Hints, DB, AC, BFI, PSI);
9904   if (!LVL.canVectorize(EnableVPlanNativePath)) {
9905     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9906     Hints.emitRemarkWithHints();
9907     return false;
9908   }
9909 
9910   // Check the function attributes and profiles to find out if this function
9911   // should be optimized for size.
9912   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9913       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
9914 
9915   // Entrance to the VPlan-native vectorization path. Outer loops are processed
9916   // here. They may require CFG and instruction level transformations before
9917   // even evaluating whether vectorization is profitable. Since we cannot modify
9918   // the incoming IR, we need to build VPlan upfront in the vectorization
9919   // pipeline.
9920   if (!L->isInnermost())
9921     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
9922                                         ORE, BFI, PSI, Hints, Requirements);
9923 
9924   assert(L->isInnermost() && "Inner loop expected.");
9925 
9926   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
9927   // count by optimizing for size, to minimize overheads.
9928   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
9929   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
9930     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
9931                       << "This loop is worth vectorizing only if no scalar "
9932                       << "iteration overheads are incurred.");
9933     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
9934       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
9935     else {
9936       LLVM_DEBUG(dbgs() << "\n");
9937       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
9938     }
9939   }
9940 
9941   // Check the function attributes to see if implicit floats are allowed.
9942   // FIXME: This check doesn't seem correct -- what if the loop is
9943   // an integer loop and the vector instructions selected are purely integer
9944   // vector instructions?
9945   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
9946     reportVectorizationFailure(
9947         "Can't vectorize when the NoImplicitFloat attribute is used",
9948         "loop not vectorized due to NoImplicitFloat attribute",
9949         "NoImplicitFloat", ORE, L);
9950     Hints.emitRemarkWithHints();
9951     return false;
9952   }
9953 
9954   // Check if the target supports potentially unsafe FP vectorization.
9955   // FIXME: Add a check for the type of safety issue (denormal, signaling)
9956   // for the target we're vectorizing for, to make sure none of the
9957   // additional fp-math flags can help.
9958   if (Hints.isPotentiallyUnsafe() &&
9959       TTI->isFPVectorizationPotentiallyUnsafe()) {
9960     reportVectorizationFailure(
9961         "Potentially unsafe FP op prevents vectorization",
9962         "loop not vectorized due to unsafe FP support.",
9963         "UnsafeFP", ORE, L);
9964     Hints.emitRemarkWithHints();
9965     return false;
9966   }
9967 
9968   if (!LVL.canVectorizeFPMath(EnableStrictReductions)) {
9969     ORE->emit([&]() {
9970       auto *ExactFPMathInst = Requirements.getExactFPInst();
9971       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
9972                                                  ExactFPMathInst->getDebugLoc(),
9973                                                  ExactFPMathInst->getParent())
9974              << "loop not vectorized: cannot prove it is safe to reorder "
9975                 "floating-point operations";
9976     });
9977     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
9978                          "reorder floating-point operations\n");
9979     Hints.emitRemarkWithHints();
9980     return false;
9981   }
9982 
9983   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9984   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9985 
9986   // If an override option has been passed in for interleaved accesses, use it.
9987   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9988     UseInterleaved = EnableInterleavedMemAccesses;
9989 
9990   // Analyze interleaved memory accesses.
9991   if (UseInterleaved) {
9992     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
9993   }
9994 
9995   // Use the cost model.
9996   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
9997                                 F, &Hints, IAI);
9998   CM.collectValuesToIgnore();
9999 
10000   // Use the planner for vectorization.
10001   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10002                                Requirements, ORE);
10003 
10004   // Get user vectorization factor and interleave count.
10005   ElementCount UserVF = Hints.getWidth();
10006   unsigned UserIC = Hints.getInterleave();
10007 
10008   // Plan how to best vectorize, return the best VF and its cost.
10009   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10010 
10011   VectorizationFactor VF = VectorizationFactor::Disabled();
10012   unsigned IC = 1;
10013 
10014   if (MaybeVF) {
10015     VF = *MaybeVF;
10016     // Select the interleave count.
10017     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10018   }
10019 
10020   // Identify the diagnostic messages that should be produced.
10021   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10022   bool VectorizeLoop = true, InterleaveLoop = true;
10023   if (VF.Width.isScalar()) {
10024     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10025     VecDiagMsg = std::make_pair(
10026         "VectorizationNotBeneficial",
10027         "the cost-model indicates that vectorization is not beneficial");
10028     VectorizeLoop = false;
10029   }
10030 
10031   if (!MaybeVF && UserIC > 1) {
10032     // Tell the user interleaving was avoided up-front, despite being explicitly
10033     // requested.
10034     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10035                          "interleaving should be avoided up front\n");
10036     IntDiagMsg = std::make_pair(
10037         "InterleavingAvoided",
10038         "Ignoring UserIC, because interleaving was avoided up front");
10039     InterleaveLoop = false;
10040   } else if (IC == 1 && UserIC <= 1) {
10041     // Tell the user interleaving is not beneficial.
10042     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10043     IntDiagMsg = std::make_pair(
10044         "InterleavingNotBeneficial",
10045         "the cost-model indicates that interleaving is not beneficial");
10046     InterleaveLoop = false;
10047     if (UserIC == 1) {
10048       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10049       IntDiagMsg.second +=
10050           " and is explicitly disabled or interleave count is set to 1";
10051     }
10052   } else if (IC > 1 && UserIC == 1) {
10053     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10054     LLVM_DEBUG(
10055         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10056     IntDiagMsg = std::make_pair(
10057         "InterleavingBeneficialButDisabled",
10058         "the cost-model indicates that interleaving is beneficial "
10059         "but is explicitly disabled or interleave count is set to 1");
10060     InterleaveLoop = false;
10061   }
10062 
10063   // Override IC if user provided an interleave count.
10064   IC = UserIC > 0 ? UserIC : IC;
10065 
10066   // Emit diagnostic messages, if any.
10067   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10068   if (!VectorizeLoop && !InterleaveLoop) {
10069     // Do not vectorize or interleave the loop.
10070     ORE->emit([&]() {
10071       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10072                                       L->getStartLoc(), L->getHeader())
10073              << VecDiagMsg.second;
10074     });
10075     ORE->emit([&]() {
10076       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10077                                       L->getStartLoc(), L->getHeader())
10078              << IntDiagMsg.second;
10079     });
10080     return false;
10081   } else if (!VectorizeLoop && InterleaveLoop) {
10082     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10083     ORE->emit([&]() {
10084       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10085                                         L->getStartLoc(), L->getHeader())
10086              << VecDiagMsg.second;
10087     });
10088   } else if (VectorizeLoop && !InterleaveLoop) {
10089     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10090                       << ") in " << DebugLocStr << '\n');
10091     ORE->emit([&]() {
10092       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10093                                         L->getStartLoc(), L->getHeader())
10094              << IntDiagMsg.second;
10095     });
10096   } else if (VectorizeLoop && InterleaveLoop) {
10097     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10098                       << ") in " << DebugLocStr << '\n');
10099     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10100   }
10101 
10102   bool DisableRuntimeUnroll = false;
10103   MDNode *OrigLoopID = L->getLoopID();
10104   {
10105     // Optimistically generate runtime checks. Drop them if they turn out to not
10106     // be profitable. Limit the scope of Checks, so the cleanup happens
10107     // immediately after vector code generation is done.
10108     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10109                              F->getParent()->getDataLayout());
10110     if (!VF.Width.isScalar() || IC > 1)
10111       Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
10112     LVP.setBestPlan(VF.Width, IC);
10113 
10114     using namespace ore;
10115     if (!VectorizeLoop) {
10116       assert(IC > 1 && "interleave count should not be 1 or 0");
10117       // If we decided that it is not legal to vectorize the loop, then
10118       // interleave it.
10119       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10120                                  &CM, BFI, PSI, Checks);
10121       LVP.executePlan(Unroller, DT);
10122 
10123       ORE->emit([&]() {
10124         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10125                                   L->getHeader())
10126                << "interleaved loop (interleaved count: "
10127                << NV("InterleaveCount", IC) << ")";
10128       });
10129     } else {
10130       // If we decided that it is *legal* to vectorize the loop, then do it.
10131 
10132       // Consider vectorizing the epilogue too if it's profitable.
10133       VectorizationFactor EpilogueVF =
10134           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10135       if (EpilogueVF.Width.isVector()) {
10136 
10137         // The first pass vectorizes the main loop and creates a scalar epilogue
10138         // to be vectorized by executing the plan (potentially with a different
10139         // factor) again shortly afterwards.
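        // For example (illustrative), with VF.Width == 8, IC == 1 and
        // EpilogueVF.Width == 4, the main vector loop processes 8 elements per
        // iteration and the left-over iterations run through a 4-wide epilogue
        // vector loop before any remaining scalar iterations.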
10140         EpilogueLoopVectorizationInfo EPI(VF.Width.getKnownMinValue(), IC,
10141                                           EpilogueVF.Width.getKnownMinValue(),
10142                                           1);
10143         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10144                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10145 
10146         LVP.setBestPlan(EPI.MainLoopVF, EPI.MainLoopUF);
10147         LVP.executePlan(MainILV, DT);
10148         ++LoopsVectorized;
10149 
10150         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10151         formLCSSARecursively(*L, *DT, LI, SE);
10152 
10153         // Second pass vectorizes the epilogue and adjusts the control flow
10154         // edges from the first pass.
10155         LVP.setBestPlan(EPI.EpilogueVF, EPI.EpilogueUF);
10156         EPI.MainLoopVF = EPI.EpilogueVF;
10157         EPI.MainLoopUF = EPI.EpilogueUF;
10158         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10159                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10160                                                  Checks);
10161         LVP.executePlan(EpilogILV, DT);
10162         ++LoopsEpilogueVectorized;
10163 
10164         if (!MainILV.areSafetyChecksAdded())
10165           DisableRuntimeUnroll = true;
10166       } else {
10167         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10168                                &LVL, &CM, BFI, PSI, Checks);
10169         LVP.executePlan(LB, DT);
10170         ++LoopsVectorized;
10171 
10172         // Add metadata to disable runtime unrolling of the scalar loop when
10173         // there are no runtime checks for strides and memory. A scalar loop
10174         // that is rarely used is not worth unrolling.
10175         if (!LB.areSafetyChecksAdded())
10176           DisableRuntimeUnroll = true;
10177       }
10178       // Report the vectorization decision.
10179       ORE->emit([&]() {
10180         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10181                                   L->getHeader())
10182                << "vectorized loop (vectorization width: "
10183                << NV("VectorizationFactor", VF.Width)
10184                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10185       });
10186     }
10187 
10188     if (ORE->allowExtraAnalysis(LV_NAME))
10189       checkMixedPrecision(L, ORE);
10190   }
10191 
10192   Optional<MDNode *> RemainderLoopID =
10193       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10194                                       LLVMLoopVectorizeFollowupEpilogue});
10195   if (RemainderLoopID.hasValue()) {
10196     L->setLoopID(RemainderLoopID.getValue());
10197   } else {
10198     if (DisableRuntimeUnroll)
10199       AddRuntimeUnrollDisableMetaData(L);
10200 
10201     // Mark the loop as already vectorized to avoid vectorizing again.
10202     Hints.setAlreadyVectorized();
10203   }
10204 
10205   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10206   return true;
10207 }
10208 
10209 LoopVectorizeResult LoopVectorizePass::runImpl(
10210     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10211     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10212     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10213     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10214     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10215   SE = &SE_;
10216   LI = &LI_;
10217   TTI = &TTI_;
10218   DT = &DT_;
10219   BFI = &BFI_;
10220   TLI = TLI_;
10221   AA = &AA_;
10222   AC = &AC_;
10223   GetLAA = &GetLAA_;
10224   DB = &DB_;
10225   ORE = &ORE_;
10226   PSI = PSI_;
10227 
10228   // Don't attempt if
10229   // 1. the target claims to have no vector registers, and
10230   // 2. interleaving won't help ILP.
10231   //
10232   // The second condition is necessary because, even if the target has no
10233   // vector registers, loop vectorization may still enable scalar
10234   // interleaving.
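  // For example (illustrative), a target reporting no vector registers but a
  // maximum interleave factor of 4 may still benefit from interleaving a
  // scalar reduction loop to expose more instruction-level parallelism.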
10235   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10236       TTI->getMaxInterleaveFactor(1) < 2)
10237     return LoopVectorizeResult(false, false);
10238 
10239   bool Changed = false, CFGChanged = false;
10240 
10241   // The vectorizer requires loops to be in simplified form.
10242   // Since simplification may add new inner loops, it has to run before the
10243   // legality and profitability checks. This means running the loop vectorizer
10244   // will simplify all loops, regardless of whether anything ends up being
10245   // vectorized.
10246   for (auto &L : *LI)
10247     Changed |= CFGChanged |=
10248         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10249 
10250   // Build up a worklist of inner-loops to vectorize. This is necessary as
10251   // the act of vectorizing or partially unrolling a loop creates new loops
10252   // and can invalidate iterators across the loops.
10253   SmallVector<Loop *, 8> Worklist;
10254 
10255   for (Loop *L : *LI)
10256     collectSupportedLoops(*L, LI, ORE, Worklist);
10257 
10258   LoopsAnalyzed += Worklist.size();
10259 
10260   // Now walk the identified inner loops.
10261   while (!Worklist.empty()) {
10262     Loop *L = Worklist.pop_back_val();
10263 
10264     // For the inner loops we actually process, form LCSSA to simplify the
10265     // transform.
10266     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10267 
10268     Changed |= CFGChanged |= processLoop(L);
10269   }
10270 
10271   // Process each loop nest in the function.
10272   return LoopVectorizeResult(Changed, CFGChanged);
10273 }
10274 
10275 PreservedAnalyses LoopVectorizePass::run(Function &F,
10276                                          FunctionAnalysisManager &AM) {
10277   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10278   auto &LI = AM.getResult<LoopAnalysis>(F);
10279   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10280   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10281   auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10282   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10283   auto &AA = AM.getResult<AAManager>(F);
10284   auto &AC = AM.getResult<AssumptionAnalysis>(F);
10285   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10286   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10287   MemorySSA *MSSA = EnableMSSALoopDependency
10288                         ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
10289                         : nullptr;
10290 
10291   auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10292   std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10293       [&](Loop &L) -> const LoopAccessInfo & {
10294     LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,  SE,
10295                                       TLI, TTI, nullptr, MSSA};
10296     return LAM.getResult<LoopAccessAnalysis>(L, AR);
10297   };
10298   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10299   ProfileSummaryInfo *PSI =
10300       MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10301   LoopVectorizeResult Result =
10302       runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10303   if (!Result.MadeAnyChange)
10304     return PreservedAnalyses::all();
10305   PreservedAnalyses PA;
10306 
10307   // We currently do not preserve loopinfo/dominator analyses with outer loop
10308   // vectorization. Until this is addressed, mark these analyses as preserved
10309   // only for non-VPlan-native path.
10310   // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10311   if (!EnableVPlanNativePath) {
10312     PA.preserve<LoopAnalysis>();
10313     PA.preserve<DominatorTreeAnalysis>();
10314   }
10315   if (!Result.MadeCFGChange)
10316     PA.preserveSet<CFGAnalyses>();
10317   return PA;
10318 }
10319