1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
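//
// For example (an illustrative sketch, not actual vectorizer output), with a
// vectorization factor of 4 a scalar loop such as
//   for (i = 0; i < n; i++) a[i] = b[i] + 1;
// is conceptually rewritten so that each wide iteration loads b[i..i+3],
// performs a single <4 x i32> add, stores a[i..i+3], and advances i by 4; any
// left-over iterations are handled by a scalar remainder (epilogue) loop.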
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanTransforms.h"
62 #include "llvm/ADT/APInt.h"
63 #include "llvm/ADT/ArrayRef.h"
64 #include "llvm/ADT/DenseMap.h"
65 #include "llvm/ADT/DenseMapInfo.h"
66 #include "llvm/ADT/Hashing.h"
67 #include "llvm/ADT/MapVector.h"
68 #include "llvm/ADT/None.h"
69 #include "llvm/ADT/Optional.h"
70 #include "llvm/ADT/STLExtras.h"
71 #include "llvm/ADT/SmallPtrSet.h"
72 #include "llvm/ADT/SmallSet.h"
73 #include "llvm/ADT/SmallVector.h"
74 #include "llvm/ADT/Statistic.h"
75 #include "llvm/ADT/StringRef.h"
76 #include "llvm/ADT/Twine.h"
77 #include "llvm/ADT/iterator_range.h"
78 #include "llvm/Analysis/AssumptionCache.h"
79 #include "llvm/Analysis/BasicAliasAnalysis.h"
80 #include "llvm/Analysis/BlockFrequencyInfo.h"
81 #include "llvm/Analysis/CFG.h"
82 #include "llvm/Analysis/CodeMetrics.h"
83 #include "llvm/Analysis/DemandedBits.h"
84 #include "llvm/Analysis/GlobalsModRef.h"
85 #include "llvm/Analysis/LoopAccessAnalysis.h"
86 #include "llvm/Analysis/LoopAnalysisManager.h"
87 #include "llvm/Analysis/LoopInfo.h"
88 #include "llvm/Analysis/LoopIterator.h"
89 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
90 #include "llvm/Analysis/ProfileSummaryInfo.h"
91 #include "llvm/Analysis/ScalarEvolution.h"
92 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
93 #include "llvm/Analysis/TargetLibraryInfo.h"
94 #include "llvm/Analysis/TargetTransformInfo.h"
95 #include "llvm/Analysis/VectorUtils.h"
96 #include "llvm/IR/Attributes.h"
97 #include "llvm/IR/BasicBlock.h"
98 #include "llvm/IR/CFG.h"
99 #include "llvm/IR/Constant.h"
100 #include "llvm/IR/Constants.h"
101 #include "llvm/IR/DataLayout.h"
102 #include "llvm/IR/DebugInfoMetadata.h"
103 #include "llvm/IR/DebugLoc.h"
104 #include "llvm/IR/DerivedTypes.h"
105 #include "llvm/IR/DiagnosticInfo.h"
106 #include "llvm/IR/Dominators.h"
107 #include "llvm/IR/Function.h"
108 #include "llvm/IR/IRBuilder.h"
109 #include "llvm/IR/InstrTypes.h"
110 #include "llvm/IR/Instruction.h"
111 #include "llvm/IR/Instructions.h"
112 #include "llvm/IR/IntrinsicInst.h"
113 #include "llvm/IR/Intrinsics.h"
114 #include "llvm/IR/Metadata.h"
115 #include "llvm/IR/Module.h"
116 #include "llvm/IR/Operator.h"
117 #include "llvm/IR/PatternMatch.h"
118 #include "llvm/IR/Type.h"
119 #include "llvm/IR/Use.h"
120 #include "llvm/IR/User.h"
121 #include "llvm/IR/Value.h"
122 #include "llvm/IR/ValueHandle.h"
123 #include "llvm/IR/Verifier.h"
124 #include "llvm/InitializePasses.h"
125 #include "llvm/Pass.h"
126 #include "llvm/Support/Casting.h"
127 #include "llvm/Support/CommandLine.h"
128 #include "llvm/Support/Compiler.h"
129 #include "llvm/Support/Debug.h"
130 #include "llvm/Support/ErrorHandling.h"
131 #include "llvm/Support/InstructionCost.h"
132 #include "llvm/Support/MathExtras.h"
133 #include "llvm/Support/raw_ostream.h"
134 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
135 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
136 #include "llvm/Transforms/Utils/LoopSimplify.h"
137 #include "llvm/Transforms/Utils/LoopUtils.h"
138 #include "llvm/Transforms/Utils/LoopVersioning.h"
139 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
140 #include "llvm/Transforms/Utils/SizeOpts.h"
141 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
142 #include <algorithm>
143 #include <cassert>
144 #include <cstdint>
145 #include <functional>
146 #include <iterator>
147 #include <limits>
148 #include <map>
149 #include <memory>
150 #include <string>
151 #include <tuple>
152 #include <utility>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 #ifndef NDEBUG
160 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
161 #endif
162 
163 /// @{
164 /// Metadata attribute names
165 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
166 const char LLVMLoopVectorizeFollowupVectorized[] =
167     "llvm.loop.vectorize.followup_vectorized";
168 const char LLVMLoopVectorizeFollowupEpilogue[] =
169     "llvm.loop.vectorize.followup_epilogue";
170 /// @}
171 
172 STATISTIC(LoopsVectorized, "Number of loops vectorized");
173 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
174 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
175 
176 static cl::opt<bool> EnableEpilogueVectorization(
177     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
178     cl::desc("Enable vectorization of epilogue loops."));
179 
180 static cl::opt<unsigned> EpilogueVectorizationForceVF(
181     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
182     cl::desc("When epilogue vectorization is enabled, and a value greater than "
183              "1 is specified, forces the given VF for all applicable epilogue "
184              "loops."));
185 
186 static cl::opt<unsigned> EpilogueVectorizationMinVF(
187     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
188     cl::desc("Only loops with vectorization factor equal to or larger than "
189              "the specified value are considered for epilogue vectorization."));
190 
191 /// Loops with a known constant trip count below this number are vectorized only
192 /// if no scalar iteration overheads are incurred.
193 static cl::opt<unsigned> TinyTripCountVectorThreshold(
194     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
195     cl::desc("Loops with a constant trip count that is smaller than this "
196              "value are vectorized only if no scalar iteration overheads "
197              "are incurred."));
198 
199 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
200     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
201     cl::desc("The maximum allowed number of runtime memory checks with a "
202              "vectorize(enable) pragma."));
203 
204 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
205 // and that predication is preferred; it lists the available options. I.e., the
206 // vectorizer will try to fold the tail-loop (epilogue) into the vector body
207 // and predicate the instructions accordingly. If tail-folding fails, there are
208 // different fallback strategies depending on these values:
209 namespace PreferPredicateTy {
210   enum Option {
211     ScalarEpilogue = 0,
212     PredicateElseScalarEpilogue,
213     PredicateOrDontVectorize
214   };
215 } // namespace PreferPredicateTy
216 
217 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
218     "prefer-predicate-over-epilogue",
219     cl::init(PreferPredicateTy::ScalarEpilogue),
220     cl::Hidden,
221     cl::desc("Tail-folding and predication preferences over creating a scalar "
222              "epilogue loop."),
223     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
224                           "scalar-epilogue",
225                           "Don't tail-predicate loops, create scalar epilogue"),
226                clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
227                           "predicate-else-scalar-epilogue",
228                           "Prefer tail-folding, create scalar epilogue if tail "
229                           "folding fails."),
230                clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
231                           "predicate-dont-vectorize",
232                           "Prefer tail-folding; don't attempt vectorization if "
233                           "tail-folding fails.")));
234 
235 static cl::opt<bool> MaximizeBandwidth(
236     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
237     cl::desc("Maximize bandwidth when selecting vectorization factor which "
238              "will be determined by the smallest type in the loop."));
239 
240 static cl::opt<bool> EnableInterleavedMemAccesses(
241     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
242     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
243 
244 /// An interleave-group may need masking if it resides in a block that needs
245 /// predication, or in order to mask away gaps.
246 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
247     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
248     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
249 
250 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
251     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
252     cl::desc("We don't interleave loops with an estimated constant trip count "
253              "below this number"));
254 
255 static cl::opt<unsigned> ForceTargetNumScalarRegs(
256     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
257     cl::desc("A flag that overrides the target's number of scalar registers."));
258 
259 static cl::opt<unsigned> ForceTargetNumVectorRegs(
260     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
261     cl::desc("A flag that overrides the target's number of vector registers."));
262 
263 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
264     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
265     cl::desc("A flag that overrides the target's max interleave factor for "
266              "scalar loops."));
267 
268 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
269     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
270     cl::desc("A flag that overrides the target's max interleave factor for "
271              "vectorized loops."));
272 
273 static cl::opt<unsigned> ForceTargetInstructionCost(
274     "force-target-instruction-cost", cl::init(0), cl::Hidden,
275     cl::desc("A flag that overrides the target's expected cost for "
276              "an instruction to a single constant value. Mostly "
277              "useful for getting consistent testing."));
278 
279 static cl::opt<bool> ForceTargetSupportsScalableVectors(
280     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
281     cl::desc(
282         "Pretend that scalable vectors are supported, even if the target does "
283         "not support them. This flag should only be used for testing."));
284 
285 static cl::opt<unsigned> SmallLoopCost(
286     "small-loop-cost", cl::init(20), cl::Hidden,
287     cl::desc(
288         "The cost of a loop that is considered 'small' by the interleaver."));
289 
290 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
291     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
292     cl::desc("Enable the use of the block frequency analysis to access PGO "
293              "heuristics minimizing code growth in cold regions and being more "
294              "aggressive in hot regions."));
295 
296 // Runtime interleave loops for load/store throughput.
297 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
298     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
299     cl::desc(
300         "Enable runtime interleaving until load/store ports are saturated"));
301 
302 /// Interleave small loops with scalar reductions.
303 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
304     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
305     cl::desc("Enable interleaving for loops with small iteration counts that "
306              "contain scalar reductions to expose ILP."));
307 
308 /// The number of stores in a loop that are allowed to need predication.
309 static cl::opt<unsigned> NumberOfStoresToPredicate(
310     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
311     cl::desc("Max number of stores to be predicated behind an if."));
312 
313 static cl::opt<bool> EnableIndVarRegisterHeur(
314     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
315     cl::desc("Count the induction variable only once when interleaving"));
316 
317 static cl::opt<bool> EnableCondStoresVectorization(
318     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
319     cl::desc("Enable if-predication of stores during vectorization."));
320 
321 static cl::opt<unsigned> MaxNestedScalarReductionIC(
322     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
323     cl::desc("The maximum interleave count to use when interleaving a scalar "
324              "reduction in a nested loop."));
325 
326 static cl::opt<bool>
327     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
328                            cl::Hidden,
329                            cl::desc("Prefer in-loop vector reductions, "
330                                     "overriding the target's preference."));
331 
332 static cl::opt<bool> ForceOrderedReductions(
333     "force-ordered-reductions", cl::init(false), cl::Hidden,
334     cl::desc("Enable the vectorization of loops with in-order (strict) "
335              "FP reductions"));
336 
337 static cl::opt<bool> PreferPredicatedReductionSelect(
338     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
339     cl::desc(
340         "Prefer predicating a reduction operation over an after-loop select."));
341 
342 cl::opt<bool> EnableVPlanNativePath(
343     "enable-vplan-native-path", cl::init(false), cl::Hidden,
344     cl::desc("Enable VPlan-native vectorization path with "
345              "support for outer loop vectorization."));
346 
347 // This flag enables the stress testing of the VPlan H-CFG construction in the
348 // VPlan-native vectorization path. It must be used in conjunction with
349 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
350 // verification of the H-CFGs built.
351 static cl::opt<bool> VPlanBuildStressTest(
352     "vplan-build-stress-test", cl::init(false), cl::Hidden,
353     cl::desc(
354         "Build VPlan for every supported loop nest in the function and bail "
355         "out right after the build (stress test the VPlan H-CFG construction "
356         "in the VPlan-native vectorization path)."));
357 
358 cl::opt<bool> llvm::EnableLoopInterleaving(
359     "interleave-loops", cl::init(true), cl::Hidden,
360     cl::desc("Enable loop interleaving in Loop vectorization passes"));
361 cl::opt<bool> llvm::EnableLoopVectorization(
362     "vectorize-loops", cl::init(true), cl::Hidden,
363     cl::desc("Run the Loop vectorization passes"));
364 
365 cl::opt<bool> PrintVPlansInDotFormat(
366     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
367     cl::desc("Use dot format instead of plain text when dumping VPlans"));
368 
369 /// A helper function that returns true if the given type is irregular. The
370 /// type is irregular if its allocated size doesn't equal the store size of an
371 /// element of the corresponding vector type.
372 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
373   // Determine if an array of N elements of type Ty is "bitcast compatible"
374   // with a <N x Ty> vector.
375   // This is only true if there is no padding between the array elements.
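  // For example, on typical x86 data layouts x86_fp80 is irregular: its type
  // size is 80 bits but its alloc size is 96 or 128 bits, so an array of
  // x86_fp80 is not bitcast-compatible with a vector of x86_fp80. A type like
  // i32 (32-bit size and alloc size) is regular.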
376   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
377 }
378 
379 /// A helper function that returns the reciprocal of the block probability of
380 /// predicated blocks. If we return X, we are assuming the predicated block
381 /// will execute once for every X iterations of the loop header.
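///
/// For example, with the current return value of 2, the cost model divides the
/// cost of an instruction in a predicated block by 2, modelling the assumption
/// that the block executes on roughly half of the header's iterations.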
382 ///
383 /// TODO: We should use actual block probability here, if available. Currently,
384 ///       we always assume predicated blocks have a 50% chance of executing.
385 static unsigned getReciprocalPredBlockProb() { return 2; }
386 
387 /// A helper function that returns an integer or floating-point constant with
388 /// value C.
389 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
390   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
391                            : ConstantFP::get(Ty, C);
392 }
393 
394 /// Returns "best known" trip count for the specified loop \p L as defined by
395 /// the following procedure:
396 ///   1) Returns exact trip count if it is known.
397 ///   2) Returns expected trip count according to profile data if any.
398 ///   3) Returns upper bound estimate if it is known.
399 ///   4) Returns None if all of the above failed.
400 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
401   // Check if exact trip count is known.
402   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
403     return ExpectedTC;
404 
405   // Check if there is an expected trip count available from profile data.
406   if (LoopVectorizeWithBlockFrequency)
407     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
408       return EstimatedTC;
409 
410   // Check if upper bound estimate is known.
411   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
412     return ExpectedTC;
413 
414   return None;
415 }
416 
417 // Forward declare GeneratedRTChecks.
418 class GeneratedRTChecks;
419 
420 namespace llvm {
421 
422 AnalysisKey ShouldRunExtraVectorPasses::Key;
423 
424 /// InnerLoopVectorizer vectorizes loops which contain only one basic
425 /// block to a specified vectorization factor (VF).
426 /// This class performs the widening of scalars into vectors, or multiple
427 /// scalars. This class also implements the following features:
428 /// * It inserts an epilogue loop for handling loops that don't have iteration
429 ///   counts that are known to be a multiple of the vectorization factor.
430 /// * It handles the code generation for reduction variables.
431 /// * Scalarization (implementation using scalars) of un-vectorizable
432 ///   instructions.
433 /// InnerLoopVectorizer does not perform any vectorization-legality
434 /// checks, and relies on the caller to check for the different legality
435 /// aspects. The InnerLoopVectorizer relies on the
436 /// LoopVectorizationLegality class to provide information about the induction
437 /// and reduction variables that were found for a given vectorization factor.
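///
/// A rough sketch of the skeleton this class creates around the loops
/// (simplified; main-loop vectorization only, no epilogue vectorization):
///  * an iteration-count check that bypasses straight to the scalar loop when
///    there are too few iterations,
///  * optional SCEV and memory runtime checks that also bypass to the scalar
///    loop when they fail,
///  * the vector preheader and the vector loop body,
///  * a middle block that either exits the loop or falls through to the scalar
///    preheader, and
///  * the original loop, which now acts as the scalar epilogue.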
438 class InnerLoopVectorizer {
439 public:
440   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
441                       LoopInfo *LI, DominatorTree *DT,
442                       const TargetLibraryInfo *TLI,
443                       const TargetTransformInfo *TTI, AssumptionCache *AC,
444                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
445                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
446                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
447                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
448       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
449         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
450         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
451         PSI(PSI), RTChecks(RTChecks) {
452     // Query this against the original loop and save it here because the profile
453     // of the original loop header may change as the transformation happens.
454     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
455         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
456   }
457 
458   virtual ~InnerLoopVectorizer() = default;
459 
460   /// Create a new empty loop that will contain vectorized instructions later
461   /// on, while the old loop will be used as the scalar remainder. Control flow
462   /// is generated around the vectorized (and scalar epilogue) loops consisting
463   /// of various checks and bypasses. Return the pre-header block of the new
464   /// loop and the start value for the canonical induction, if it is != 0. The
465   /// latter is the case when vectorizing the epilogue loop. In the case of
466   /// epilogue vectorization, this function is overridden to handle the more
467   /// complex control flow around the loops.
468   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
469 
470   /// Widen a single call instruction within the innermost loop.
471   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
472                             VPTransformState &State);
473 
474   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
475   void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);
476 
477   // Return true if any runtime check is added.
478   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
479 
480   /// A type for vectorized values in the new loop. Each value from the
481   /// original loop, when vectorized, is represented by UF vector values in the
482   /// new unrolled loop, where UF is the unroll factor.
483   using VectorParts = SmallVector<Value *, 2>;
484 
485   /// A helper function to scalarize a single Instruction in the innermost loop.
486   /// Generates a scalar instance of \p Instr for the part and lane given by
487   /// \p Instance. Uses the VPValue operands from \p RepRecipe instead of
488   /// \p Instr's operands.
490   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
491                             const VPIteration &Instance, bool IfPredicateInstr,
492                             VPTransformState &State);
493 
494   /// Construct the vector value of a scalarized value \p V one lane at a time.
495   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
496                                  VPTransformState &State);
497 
498   /// Try to vectorize interleaved access group \p Group with the base address
499   /// given in \p Addr, optionally masking the vector operations if \p
500   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
501   /// values in the vectorized loop.
502   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
503                                 ArrayRef<VPValue *> VPDefs,
504                                 VPTransformState &State, VPValue *Addr,
505                                 ArrayRef<VPValue *> StoredValues,
506                                 VPValue *BlockInMask = nullptr);
507 
508   /// Set the debug location in the class member Builder using the debug
509   /// location in \p V.
510   void setDebugLocFromInst(const Value *V);
511 
512   /// Fix the non-induction PHIs in \p Plan.
513   void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);
514 
515   /// Returns true if the reordering of FP operations is not allowed, but we are
516   /// able to vectorize with strict in-order reductions for the given RdxDesc.
517   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
518 
519   /// Create a broadcast instruction. This method generates a broadcast
520   /// instruction (shuffle) for loop invariant values and for the induction
521   /// value. If this is the induction variable then we extend it to N, N+1, ...
522   /// this is needed because each iteration in the loop corresponds to a SIMD
523   /// element.
524   virtual Value *getBroadcastInstrs(Value *V);
525 
526   // Returns the resume value (bc.merge.rdx) for a reduction as
527   // generated by fixReduction.
528   PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
529 
530 protected:
531   friend class LoopVectorizationPlanner;
532 
533   /// A small list of PHINodes.
534   using PhiVector = SmallVector<PHINode *, 4>;
535 
536   /// A type for scalarized values in the new loop. Each value from the
537   /// original loop, when scalarized, is represented by UF x VF scalar values
538   /// in the new unrolled loop, where UF is the unroll factor and VF is the
539   /// vectorization factor.
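  /// For example (illustrative), with UF = 2 and VF = 4 a scalarized value is
  /// held as 2 parts of 4 scalar values each, i.e. 8 scalars per original
  /// value.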
540   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
541 
542   /// Set up the values of the IVs correctly when exiting the vector loop.
543   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
544                     Value *VectorTripCount, Value *EndValue,
545                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
546                     VPlan &Plan);
547 
548   /// Handle all cross-iteration phis in the header.
549   void fixCrossIterationPHIs(VPTransformState &State);
550 
551   /// Create the exit value of first order recurrences in the middle block and
552   /// update their users.
553   void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
554                                VPTransformState &State);
555 
556   /// Create code for the loop exit value of the reduction.
557   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
558 
559   /// Clear NSW/NUW flags from reduction instructions if necessary.
560   void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
561                                VPTransformState &State);
562 
563   /// Iteratively sink the scalarized operands of a predicated instruction into
564   /// the block that was created for it.
565   void sinkScalarOperands(Instruction *PredInst);
566 
567   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
568   /// represented as.
569   void truncateToMinimalBitwidths(VPTransformState &State);
570 
571   /// Returns (and creates if needed) the original loop trip count.
572   Value *getOrCreateTripCount(BasicBlock *InsertBlock);
573 
574   /// Returns (and creates if needed) the trip count of the widened loop.
575   Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
576 
577   /// Returns a bitcasted value to the requested vector type.
578   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
579   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
580                                 const DataLayout &DL);
581 
582   /// Emit a bypass check to see if the vector trip count is zero, including if
583   /// it overflows.
584   void emitIterationCountCheck(BasicBlock *Bypass);
585 
586   /// Emit a bypass check to see if all of the SCEV assumptions we've
587   /// had to make are correct. Returns the block containing the checks or
588   /// nullptr if no checks have been added.
589   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
590 
591   /// Emit bypass checks to check any memory assumptions we may have made.
592   /// Returns the block containing the checks or nullptr if no checks have been
593   /// added.
594   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);
595 
596   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
597   /// vector loop preheader, middle block and scalar preheader.
598   void createVectorLoopSkeleton(StringRef Prefix);
599 
600   /// Create new phi nodes for the induction variables so that the scalar
601   /// epilogue resumes the iteration count from where the vectorized loop left off.
602   /// In cases where the loop skeleton is more complicated (eg. epilogue
603   /// vectorization) and the resume values can come from an additional bypass
604   /// block, the \p AdditionalBypass pair provides information about the bypass
605   /// block and the end value on the edge from bypass to this loop.
606   void createInductionResumeValues(
607       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
608 
609   /// Complete the loop skeleton by adding debug MDs, creating appropriate
610   /// conditional branches in the middle block, preparing the builder and
611   /// running the verifier. Return the preheader of the completed vector loop.
612   BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);
613 
614   /// Collect poison-generating recipes that may generate a poison value that is
615   /// used after vectorization, even when their operands are not poison. Those
616   /// recipes meet the following conditions:
617   ///  * Contribute to the address computation of a recipe generating a widen
618   ///    memory load/store (VPWidenMemoryInstructionRecipe or
619   ///    VPInterleaveRecipe).
620   ///  * Such a widen memory load/store has at least one underlying Instruction
621   ///    that is in a basic block that needs predication and after vectorization
622   ///    the generated instruction won't be predicated.
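  ///
  /// For example (illustrative): a "getelementptr inbounds" feeding the address
  /// of a load that was conditional in the original loop may yield poison for
  /// lanes whose condition is false once the load is widened and executed
  /// unconditionally, so the recipe for that GEP is collected here and its
  /// poison-generating (inbounds) flag can be dropped during code generation.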
623   void collectPoisonGeneratingRecipes(VPTransformState &State);
624 
625   /// Allow subclasses to override and print debug traces before/after vplan
626   /// execution, when trace information is requested.
627   virtual void printDebugTracesAtStart() {}
628   virtual void printDebugTracesAtEnd() {}
629 
630   /// The original loop.
631   Loop *OrigLoop;
632 
633   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
634   /// dynamic knowledge to simplify SCEV expressions and converts them to a
635   /// more usable form.
636   PredicatedScalarEvolution &PSE;
637 
638   /// Loop Info.
639   LoopInfo *LI;
640 
641   /// Dominator Tree.
642   DominatorTree *DT;
643 
644   /// Alias Analysis.
645   AAResults *AA;
646 
647   /// Target Library Info.
648   const TargetLibraryInfo *TLI;
649 
650   /// Target Transform Info.
651   const TargetTransformInfo *TTI;
652 
653   /// Assumption Cache.
654   AssumptionCache *AC;
655 
656   /// Interface to emit optimization remarks.
657   OptimizationRemarkEmitter *ORE;
658 
659   /// The vectorization SIMD factor to use. Each vector will have this many
660   /// vector elements.
661   ElementCount VF;
662 
663   /// The vectorization unroll factor to use. Each scalar is vectorized to this
664   /// many different vector instructions.
665   unsigned UF;
666 
667   /// The builder that we use
668   IRBuilder<> Builder;
669 
670   // --- Vectorization state ---
671 
672   /// The vector-loop preheader.
673   BasicBlock *LoopVectorPreHeader;
674 
675   /// The scalar-loop preheader.
676   BasicBlock *LoopScalarPreHeader;
677 
678   /// Middle Block between the vector and the scalar.
679   BasicBlock *LoopMiddleBlock;
680 
681   /// The unique ExitBlock of the scalar loop if one exists.  Note that
682   /// there can be multiple exiting edges reaching this block.
683   BasicBlock *LoopExitBlock;
684 
685   /// The scalar loop body.
686   BasicBlock *LoopScalarBody;
687 
688   /// A list of all bypass blocks. The first block is the entry of the loop.
689   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
690 
691   /// Store instructions that were predicated.
692   SmallVector<Instruction *, 4> PredicatedInstructions;
693 
694   /// Trip count of the original loop.
695   Value *TripCount = nullptr;
696 
697   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
698   Value *VectorTripCount = nullptr;
699 
700   /// The legality analysis.
701   LoopVectorizationLegality *Legal;
702 
703   /// The profitability analysis.
704   LoopVectorizationCostModel *Cost;
705 
706   // Record whether runtime checks are added.
707   bool AddedSafetyChecks = false;
708 
709   // Holds the end values for each induction variable. We save the end values
710   // so we can later fix-up the external users of the induction variables.
711   DenseMap<PHINode *, Value *> IVEndValues;
712 
713   /// BFI and PSI are used to check for profile-guided size optimizations.
714   BlockFrequencyInfo *BFI;
715   ProfileSummaryInfo *PSI;
716 
717   // Whether this loop should be optimized for size based on profile-guided
718   // size optimizations.
719   bool OptForSizeBasedOnProfile;
720 
721   /// Structure to hold information about generated runtime checks, responsible
722   /// for cleaning the checks, if vectorization turns out unprofitable.
723   GeneratedRTChecks &RTChecks;
724 
725   // Holds the resume values for reductions in the loops, used to set the
726   // correct start value of reduction PHIs when vectorizing the epilogue.
727   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
728       ReductionResumeValues;
729 };
730 
731 class InnerLoopUnroller : public InnerLoopVectorizer {
732 public:
733   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
734                     LoopInfo *LI, DominatorTree *DT,
735                     const TargetLibraryInfo *TLI,
736                     const TargetTransformInfo *TTI, AssumptionCache *AC,
737                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
738                     LoopVectorizationLegality *LVL,
739                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
740                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
741       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
742                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
743                             BFI, PSI, Check) {}
744 
745 private:
746   Value *getBroadcastInstrs(Value *V) override;
747 };
748 
749 /// Encapsulate information regarding vectorization of a loop and its epilogue.
750 /// This information is meant to be updated and used across two stages of
751 /// epilogue vectorization.
752 struct EpilogueLoopVectorizationInfo {
753   ElementCount MainLoopVF = ElementCount::getFixed(0);
754   unsigned MainLoopUF = 0;
755   ElementCount EpilogueVF = ElementCount::getFixed(0);
756   unsigned EpilogueUF = 0;
757   BasicBlock *MainLoopIterationCountCheck = nullptr;
758   BasicBlock *EpilogueIterationCountCheck = nullptr;
759   BasicBlock *SCEVSafetyCheck = nullptr;
760   BasicBlock *MemSafetyCheck = nullptr;
761   Value *TripCount = nullptr;
762   Value *VectorTripCount = nullptr;
763 
764   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
765                                 ElementCount EVF, unsigned EUF)
766       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
767     assert(EUF == 1 &&
768            "A high UF for the epilogue loop is likely not beneficial.");
769   }
770 };
771 
772 /// An extension of the inner loop vectorizer that creates a skeleton for a
773 /// vectorized loop that has its epilogue (residual) also vectorized.
774 /// The idea is to run the vplan on a given loop twice, firstly to set up the
775 /// skeleton and vectorize the main loop, and secondly to complete the skeleton
776 /// from the first step and vectorize the epilogue.  This is achieved by
777 /// deriving two concrete strategy classes from this base class and invoking
778 /// them in succession from the loop vectorizer planner.
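///
/// A rough sketch of the resulting loop structure (simplified, assuming all
/// checks pass and both vector loops run):
///   main iteration-count check -> main vector loop -> epilogue iteration-count
///   check -> vector epilogue loop -> scalar remainder loop -> exit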
779 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
780 public:
781   InnerLoopAndEpilogueVectorizer(
782       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
783       DominatorTree *DT, const TargetLibraryInfo *TLI,
784       const TargetTransformInfo *TTI, AssumptionCache *AC,
785       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
786       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
787       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
788       GeneratedRTChecks &Checks)
789       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
790                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
791                             Checks),
792         EPI(EPI) {}
793 
794   // Override this function to handle the more complex control flow around the
795   // three loops.
796   std::pair<BasicBlock *, Value *>
797   createVectorizedLoopSkeleton() final override {
798     return createEpilogueVectorizedLoopSkeleton();
799   }
800 
801   /// The interface for creating a vectorized skeleton using one of two
802   /// different strategies, each corresponding to one execution of the vplan
803   /// as described above.
804   virtual std::pair<BasicBlock *, Value *>
805   createEpilogueVectorizedLoopSkeleton() = 0;
806 
807   /// Holds and updates state information required to vectorize the main loop
808   /// and its epilogue in two separate passes. This setup helps us avoid
809   /// regenerating and recomputing runtime safety checks. It also helps us to
810   /// shorten the iteration-count-check path length for the cases where the
811   /// iteration count of the loop is so small that the main vector loop is
812   /// completely skipped.
813   EpilogueLoopVectorizationInfo &EPI;
814 };
815 
816 /// A specialized derived class of inner loop vectorizer that performs
817 /// vectorization of *main* loops in the process of vectorizing loops and their
818 /// epilogues.
819 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
820 public:
821   EpilogueVectorizerMainLoop(
822       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
823       DominatorTree *DT, const TargetLibraryInfo *TLI,
824       const TargetTransformInfo *TTI, AssumptionCache *AC,
825       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
826       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
827       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
828       GeneratedRTChecks &Check)
829       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
830                                        EPI, LVL, CM, BFI, PSI, Check) {}
831   /// Implements the interface for creating a vectorized skeleton using the
832   /// *main loop* strategy (ie the first pass of vplan execution).
833   std::pair<BasicBlock *, Value *>
834   createEpilogueVectorizedLoopSkeleton() final override;
835 
836 protected:
837   /// Emits an iteration count bypass check once for the main loop (when \p
838   /// ForEpilogue is false) and once for the epilogue loop (when \p
839   /// ForEpilogue is true).
840   BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
841   void printDebugTracesAtStart() override;
842   void printDebugTracesAtEnd() override;
843 };
844 
845 // A specialized derived class of inner loop vectorizer that performs
846 // vectorization of *epilogue* loops in the process of vectorizing loops and
847 // their epilogues.
848 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
849 public:
850   EpilogueVectorizerEpilogueLoop(
851       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
852       DominatorTree *DT, const TargetLibraryInfo *TLI,
853       const TargetTransformInfo *TTI, AssumptionCache *AC,
854       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
855       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
856       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
857       GeneratedRTChecks &Checks)
858       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
859                                        EPI, LVL, CM, BFI, PSI, Checks) {
860     TripCount = EPI.TripCount;
861   }
862   /// Implements the interface for creating a vectorized skeleton using the
863   /// *epilogue loop* strategy (ie the second pass of vplan execution).
864   std::pair<BasicBlock *, Value *>
865   createEpilogueVectorizedLoopSkeleton() final override;
866 
867 protected:
868   /// Emits an iteration count bypass check after the main vector loop has
869   /// finished to see if there are any iterations left to execute by either
870   /// the vector epilogue or the scalar epilogue.
871   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
872                                                       BasicBlock *Insert);
874   void printDebugTracesAtStart() override;
875   void printDebugTracesAtEnd() override;
876 };
877 } // end namespace llvm
878 
879 /// Look for a meaningful debug location on the instruction or its
880 /// operands.
881 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
882   if (!I)
883     return I;
884 
885   DebugLoc Empty;
886   if (I->getDebugLoc() != Empty)
887     return I;
888 
889   for (Use &Op : I->operands()) {
890     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
891       if (OpInst->getDebugLoc() != Empty)
892         return OpInst;
893   }
894 
895   return I;
896 }
897 
898 void InnerLoopVectorizer::setDebugLocFromInst(const Value *V) {
900   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
901     const DILocation *DIL = Inst->getDebugLoc();
902 
903     // When an FSDiscriminator is enabled, we don't need to add the multiply
904     // factors to the discriminators.
905     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
906         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
907       // FIXME: For scalable vectors, assume vscale=1.
908       auto NewDIL =
909           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
910       if (NewDIL)
911         Builder.SetCurrentDebugLocation(*NewDIL);
912       else
913         LLVM_DEBUG(dbgs()
914                    << "Failed to create new discriminator: "
915                    << DIL->getFilename() << " Line: " << DIL->getLine());
916     } else
917       Builder.SetCurrentDebugLocation(DIL);
918   } else
919     Builder.SetCurrentDebugLocation(DebugLoc());
920 }
921 
922 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
923 /// is passed, the message relates to that particular instruction.
924 #ifndef NDEBUG
925 static void debugVectorizationMessage(const StringRef Prefix,
926                                       const StringRef DebugMsg,
927                                       Instruction *I) {
928   dbgs() << "LV: " << Prefix << DebugMsg;
929   if (I != nullptr)
930     dbgs() << " " << *I;
931   else
932     dbgs() << '.';
933   dbgs() << '\n';
934 }
935 #endif
936 
937 /// Create an analysis remark that explains why vectorization failed
938 ///
939 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
940 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
941 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
942 /// the location of the remark.  \return the remark object that can be
943 /// streamed to.
944 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
945     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
946   Value *CodeRegion = TheLoop->getHeader();
947   DebugLoc DL = TheLoop->getStartLoc();
948 
949   if (I) {
950     CodeRegion = I->getParent();
951     // If there is no debug location attached to the instruction, fall back to
952     // using the loop's start location.
953     if (I->getDebugLoc())
954       DL = I->getDebugLoc();
955   }
956 
957   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
958 }
959 
960 namespace llvm {
961 
962 /// Return a value for Step multiplied by VF.
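/// For example (illustrative): with a fixed VF of 4 and Step 2 this returns the
/// constant 8; with a scalable VF of vscale x 4 and Step 2 it returns the
/// runtime value 8 * vscale.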
963 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
964                        int64_t Step) {
965   assert(Ty->isIntegerTy() && "Expected an integer step");
966   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
967   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
968 }
969 
970 /// Return the runtime value for VF.
971 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
972   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
973   return VF.isScalable() ? B.CreateVScale(EC) : EC;
974 }
975 
976 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
977                                   ElementCount VF) {
978   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
979   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
980   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
981   return B.CreateUIToFP(RuntimeVF, FTy);
982 }
983 
984 void reportVectorizationFailure(const StringRef DebugMsg,
985                                 const StringRef OREMsg, const StringRef ORETag,
986                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
987                                 Instruction *I) {
988   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
989   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
990   ORE->emit(
991       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
992       << "loop not vectorized: " << OREMsg);
993 }
994 
995 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
996                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
997                              Instruction *I) {
998   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
999   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1000   ORE->emit(
1001       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1002       << Msg);
1003 }
1004 
1005 } // end namespace llvm
1006 
1007 #ifndef NDEBUG
1008 /// \return string containing a file name and a line # for the given loop.
1009 static std::string getDebugLocString(const Loop *L) {
1010   std::string Result;
1011   if (L) {
1012     raw_string_ostream OS(Result);
1013     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1014       LoopDbgLoc.print(OS);
1015     else
1016       // Just print the module name.
1017       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1018     OS.flush();
1019   }
1020   return Result;
1021 }
1022 #endif
1023 
1024 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1025     VPTransformState &State) {
1026 
1027   // Collect recipes in the backward slice of `Root` that may generate a poison
1028   // value that is used after vectorization.
1029   SmallPtrSet<VPRecipeBase *, 16> Visited;
1030   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1031     SmallVector<VPRecipeBase *, 16> Worklist;
1032     Worklist.push_back(Root);
1033 
1034     // Traverse the backward slice of Root through its use-def chain.
1035     while (!Worklist.empty()) {
1036       VPRecipeBase *CurRec = Worklist.back();
1037       Worklist.pop_back();
1038 
1039       if (!Visited.insert(CurRec).second)
1040         continue;
1041 
1042       // Prune search if we find another recipe generating a widen memory
1043       // instruction. Widen memory instructions involved in address computation
1044       // will lead to gather/scatter instructions, which don't need to be
1045       // handled.
1046       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1047           isa<VPInterleaveRecipe>(CurRec) ||
1048           isa<VPScalarIVStepsRecipe>(CurRec) ||
1049           isa<VPCanonicalIVPHIRecipe>(CurRec))
1050         continue;
1051 
1052       // This recipe contributes to the address computation of a widen
1053       // load/store. Collect recipe if its underlying instruction has
1054       // poison-generating flags.
1055       Instruction *Instr = CurRec->getUnderlyingInstr();
1056       if (Instr && Instr->hasPoisonGeneratingFlags())
1057         State.MayGeneratePoisonRecipes.insert(CurRec);
1058 
1059       // Add new definitions to the worklist.
1060       for (VPValue *operand : CurRec->operands())
1061         if (VPDef *OpDef = operand->getDef())
1062           Worklist.push_back(cast<VPRecipeBase>(OpDef));
1063     }
1064   });
1065 
1066   // Traverse all the recipes in the VPlan and collect the poison-generating
1067   // recipes in the backward slice starting at the address of a
1068   // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1069   auto Iter = depth_first(
1070       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1071   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1072     for (VPRecipeBase &Recipe : *VPBB) {
1073       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1074         Instruction &UnderlyingInstr = WidenRec->getIngredient();
1075         VPDef *AddrDef = WidenRec->getAddr()->getDef();
1076         if (AddrDef && WidenRec->isConsecutive() &&
1077             Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
1078           collectPoisonGeneratingInstrsInBackwardSlice(
1079               cast<VPRecipeBase>(AddrDef));
1080       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1081         VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1082         if (AddrDef) {
1083           // Check if any member of the interleave group needs predication.
1084           const InterleaveGroup<Instruction> *InterGroup =
1085               InterleaveRec->getInterleaveGroup();
1086           bool NeedPredication = false;
1087           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1088                I < NumMembers; ++I) {
1089             Instruction *Member = InterGroup->getMember(I);
1090             if (Member)
1091               NeedPredication |=
1092                   Legal->blockNeedsPredication(Member->getParent());
1093           }
1094 
1095           if (NeedPredication)
1096             collectPoisonGeneratingInstrsInBackwardSlice(
1097                 cast<VPRecipeBase>(AddrDef));
1098         }
1099       }
1100     }
1101   }
1102 }
1103 
1104 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1105     const RecurrenceDescriptor &RdxDesc) {
1106   auto It = ReductionResumeValues.find(&RdxDesc);
1107   assert(It != ReductionResumeValues.end() &&
1108          "Expected to find a resume value for the reduction.");
1109   return It->second;
1110 }
1111 
1112 namespace llvm {
1113 
1114 // Loop vectorization cost-model hint describing how the scalar epilogue loop
1115 // should be lowered.
1116 enum ScalarEpilogueLowering {
1117 
1118   // The default: allowing scalar epilogues.
1119   CM_ScalarEpilogueAllowed,
1120 
1121   // Vectorization with OptForSize: don't allow epilogues.
1122   CM_ScalarEpilogueNotAllowedOptSize,
1123 
1124   // A special case of vectorization with OptForSize: loops with a very small
1125   // trip count are considered for vectorization under OptForSize, thereby
1126   // making sure the cost of their loop body is dominant, free of runtime
1127   // guards and scalar iteration overheads.
1128   CM_ScalarEpilogueNotAllowedLowTripLoop,
1129 
1130   // Loop hint predicate indicating an epilogue is undesired.
1131   CM_ScalarEpilogueNotNeededUsePredicate,
1132 
1133   // Directive indicating we must either tail fold or not vectorize
1134   CM_ScalarEpilogueNotAllowedUsePredicate
1135 };
1136 
1137 /// ElementCountComparator creates a total ordering for ElementCount
1138 /// for the purposes of using it in a set structure.
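/// For example, the induced order is 1 < 2 < 4 < ... < vscale x 1 < vscale x 2
/// < ..., i.e. all fixed element counts precede all scalable ones, and counts
/// of the same kind are ordered by their known minimum value.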
1139 struct ElementCountComparator {
1140   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1141     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1142            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1143   }
1144 };
1145 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1146 
1147 /// LoopVectorizationCostModel - estimates the expected speedups due to
1148 /// vectorization.
1149 /// In many cases vectorization is not profitable. This can happen because of
1150 /// a number of reasons. In this class we mainly attempt to predict the
1151 /// expected speedup/slowdowns due to the supported instruction set. We use the
1152 /// TargetTransformInfo to query the different backends for the cost of
1153 /// different operations.
1154 class LoopVectorizationCostModel {
1155 public:
1156   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1157                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1158                              LoopVectorizationLegality *Legal,
1159                              const TargetTransformInfo &TTI,
1160                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1161                              AssumptionCache *AC,
1162                              OptimizationRemarkEmitter *ORE, const Function *F,
1163                              const LoopVectorizeHints *Hints,
1164                              InterleavedAccessInfo &IAI)
1165       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1166         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1167         Hints(Hints), InterleaveInfo(IAI) {}
1168 
1169   /// \return An upper bound for the vectorization factors (both fixed and
1170   /// scalable). If the factors are 0, vectorization and interleaving should be
1171   /// avoided up front.
1172   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1173 
1174   /// \return True if runtime checks are required for vectorization, and false
1175   /// otherwise.
1176   bool runtimeChecksRequired();
1177 
1178   /// \return The most profitable vectorization factor and the cost of that VF.
1179   /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1180   /// then this vectorization factor will be selected if vectorization is
1181   /// possible.
1182   VectorizationFactor
1183   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1184 
1185   VectorizationFactor
1186   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1187                                     const LoopVectorizationPlanner &LVP);
1188 
1189   /// Setup cost-based decisions for user vectorization factor.
1190   /// \return true if the UserVF is a feasible VF to be chosen.
1191   bool selectUserVectorizationFactor(ElementCount UserVF) {
1192     collectUniformsAndScalars(UserVF);
1193     collectInstsToScalarize(UserVF);
1194     return expectedCost(UserVF).first.isValid();
1195   }
1196 
1197   /// \return The size (in bits) of the smallest and widest types in the code
1198   /// that needs to be vectorized. We ignore values that remain scalar such as
1199   /// 64 bit loop indices.
1200   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1201 
1202   /// \return The desired interleave count.
1203   /// If interleave count has been specified by metadata it will be returned.
1204   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1205   /// are the selected vectorization factor and the cost of the selected VF.
1206   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1207 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
1214   /// avoid redundant calculations.
1215   void setCostBasedWideningDecision(ElementCount VF);
1216 
1217   /// A struct that represents some properties of the register usage
1218   /// of a loop.
1219   struct RegisterUsage {
1220     /// Holds the number of loop invariant values that are used in the loop.
    /// The key is the ClassID of the target-provided register class.
1222     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1223     /// Holds the maximum number of concurrent live intervals in the loop.
    /// The key is the ClassID of the target-provided register class.
1225     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1226   };
1227 
  /// \return Information about the register usage of the loop for the
1229   /// given vectorization factors.
1230   SmallVector<RegisterUsage, 8>
1231   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1232 
1233   /// Collect values we want to ignore in the cost model.
1234   void collectValuesToIgnore();
1235 
1236   /// Collect all element types in the loop for which widening is needed.
1237   void collectElementTypesForWidening();
1238 
1239   /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1241   void collectInLoopReductions();
1242 
1243   /// Returns true if we should use strict in-order reductions for the given
1244   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1245   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1246   /// of FP operations.
1247   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1248     return !Hints->allowReordering() && RdxDesc.isOrdered();
1249   }
1250 
1251   /// \returns The smallest bitwidth each instruction can be represented with.
1252   /// The vector equivalents of these instructions should be truncated to this
1253   /// type.
1254   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1255     return MinBWs;
1256   }
1257 
1258   /// \returns True if it is more profitable to scalarize instruction \p I for
1259   /// vectorization factor \p VF.
1260   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1261     assert(VF.isVector() &&
1262            "Profitable to scalarize relevant only for VF > 1.");
1263 
1264     // Cost model is not run in the VPlan-native path - return conservative
1265     // result until this changes.
1266     if (EnableVPlanNativePath)
1267       return false;
1268 
1269     auto Scalars = InstsToScalarize.find(VF);
1270     assert(Scalars != InstsToScalarize.end() &&
1271            "VF not yet analyzed for scalarization profitability");
1272     return Scalars->second.find(I) != Scalars->second.end();
1273   }
1274 
1275   /// Returns true if \p I is known to be uniform after vectorization.
1276   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1277     if (VF.isScalar())
1278       return true;
1279 
1280     // Cost model is not run in the VPlan-native path - return conservative
1281     // result until this changes.
1282     if (EnableVPlanNativePath)
1283       return false;
1284 
1285     auto UniformsPerVF = Uniforms.find(VF);
1286     assert(UniformsPerVF != Uniforms.end() &&
1287            "VF not yet analyzed for uniformity");
1288     return UniformsPerVF->second.count(I);
1289   }
1290 
1291   /// Returns true if \p I is known to be scalar after vectorization.
1292   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1293     if (VF.isScalar())
1294       return true;
1295 
1296     // Cost model is not run in the VPlan-native path - return conservative
1297     // result until this changes.
1298     if (EnableVPlanNativePath)
1299       return false;
1300 
1301     auto ScalarsPerVF = Scalars.find(VF);
1302     assert(ScalarsPerVF != Scalars.end() &&
1303            "Scalar values are not calculated for VF");
1304     return ScalarsPerVF->second.count(I);
1305   }
1306 
1307   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1308   /// for vectorization factor \p VF.
1309   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1310     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1311            !isProfitableToScalarize(I, VF) &&
1312            !isScalarAfterVectorization(I, VF);
1313   }
1314 
1315   /// Decision that was taken during cost calculation for memory instruction.
1316   enum InstWidening {
1317     CM_Unknown,
1318     CM_Widen,         // For consecutive accesses with stride +1.
1319     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1320     CM_Interleave,
1321     CM_GatherScatter,
1322     CM_Scalarize
1323   };
1324 
1325   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1326   /// instruction \p I and vector width \p VF.
1327   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1328                            InstructionCost Cost) {
1329     assert(VF.isVector() && "Expected VF >=2");
1330     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1331   }
1332 
1333   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1334   /// interleaving group \p Grp and vector width \p VF.
1335   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1336                            ElementCount VF, InstWidening W,
1337                            InstructionCost Cost) {
1338     assert(VF.isVector() && "Expected VF >=2");
    /// Broadcast this decision to all instructions inside the group.
1340     /// But the cost will be assigned to one instruction only.
1341     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1342       if (auto *I = Grp->getMember(i)) {
1343         if (Grp->getInsertPos() == I)
1344           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1345         else
1346           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1347       }
1348     }
1349   }
1350 
1351   /// Return the cost model decision for the given instruction \p I and vector
1352   /// width \p VF. Return CM_Unknown if this instruction did not pass
1353   /// through the cost modeling.
1354   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1355     assert(VF.isVector() && "Expected VF to be a vector VF");
1356     // Cost model is not run in the VPlan-native path - return conservative
1357     // result until this changes.
1358     if (EnableVPlanNativePath)
1359       return CM_GatherScatter;
1360 
1361     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1362     auto Itr = WideningDecisions.find(InstOnVF);
1363     if (Itr == WideningDecisions.end())
1364       return CM_Unknown;
1365     return Itr->second.first;
1366   }
1367 
1368   /// Return the vectorization cost for the given instruction \p I and vector
1369   /// width \p VF.
1370   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1371     assert(VF.isVector() && "Expected VF >=2");
1372     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1373     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1374            "The cost is not calculated");
1375     return WideningDecisions[InstOnVF].second;
1376   }
1377 
1378   /// Return True if instruction \p I is an optimizable truncate whose operand
1379   /// is an induction variable. Such a truncate will be removed by adding a new
1380   /// induction variable with the destination type.
1381   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1382     // If the instruction is not a truncate, return false.
1383     auto *Trunc = dyn_cast<TruncInst>(I);
1384     if (!Trunc)
1385       return false;
1386 
1387     // Get the source and destination types of the truncate.
1388     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1389     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1390 
1391     // If the truncate is free for the given types, return false. Replacing a
1392     // free truncate with an induction variable would add an induction variable
1393     // update instruction to each iteration of the loop. We exclude from this
1394     // check the primary induction variable since it will need an update
1395     // instruction regardless.
1396     Value *Op = Trunc->getOperand(0);
1397     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1398       return false;
1399 
1400     // If the truncated value is not an induction variable, return false.
1401     return Legal->isInductionPhi(Op);
1402   }
1403 
1404   /// Collects the instructions to scalarize for each predicated instruction in
1405   /// the loop.
1406   void collectInstsToScalarize(ElementCount VF);
1407 
1408   /// Collect Uniform and Scalar values for the given \p VF.
  /// The sets depend on the CM decision for Load/Store instructions, which
  /// may be vectorized as interleaved accesses, gather/scatter operations, or
  /// scalarized.
1411   void collectUniformsAndScalars(ElementCount VF) {
1412     // Do the analysis once.
1413     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1414       return;
1415     setCostBasedWideningDecision(VF);
1416     collectLoopUniforms(VF);
1417     collectLoopScalars(VF);
1418   }
1419 
1420   /// Returns true if the target machine supports masked store operation
1421   /// for the given \p DataType and kind of access to \p Ptr.
1422   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1423     return Legal->isConsecutivePtr(DataType, Ptr) &&
1424            TTI.isLegalMaskedStore(DataType, Alignment);
1425   }
1426 
1427   /// Returns true if the target machine supports masked load operation
1428   /// for the given \p DataType and kind of access to \p Ptr.
1429   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1430     return Legal->isConsecutivePtr(DataType, Ptr) &&
1431            TTI.isLegalMaskedLoad(DataType, Alignment);
1432   }
1433 
1434   /// Returns true if the target machine can represent \p V as a masked gather
1435   /// or scatter operation.
1436   bool isLegalGatherOrScatter(Value *V,
1437                               ElementCount VF = ElementCount::getFixed(1)) {
1438     bool LI = isa<LoadInst>(V);
1439     bool SI = isa<StoreInst>(V);
1440     if (!LI && !SI)
1441       return false;
1442     auto *Ty = getLoadStoreType(V);
1443     Align Align = getLoadStoreAlignment(V);
1444     if (VF.isVector())
1445       Ty = VectorType::get(Ty, VF);
1446     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1447            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1448   }
1449 
1450   /// Returns true if the target machine supports all of the reduction
1451   /// variables found for the given VF.
1452   bool canVectorizeReductions(ElementCount VF) const {
1453     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1454       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1455       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1456     }));
1457   }
1458 
1459   /// Returns true if \p I is an instruction that will be scalarized with
1460   /// predication when vectorizing \p I with vectorization factor \p VF. Such
1461   /// instructions include conditional stores and instructions that may divide
1462   /// by zero.
1463   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1464 
1465   // Returns true if \p I is an instruction that will be predicated either
1466   // through scalar predication or masked load/store or masked gather/scatter.
1467   // \p VF is the vectorization factor that will be used to vectorize \p I.
1468   // Superset of instructions that return true for isScalarWithPredication.
1469   bool isPredicatedInst(Instruction *I, ElementCount VF,
1470                         bool IsKnownUniform = false) {
    // When we know the load is uniform and the original scalar loop was not
    // predicated, we don't need to mark it as a predicated instruction. Any
    // vectorized blocks created when tail-folding are artificial blocks we
    // have introduced, and we know there is always at least one active lane.
    // That's why we call Legal->blockNeedsPredication here: it doesn't
    // query tail-folding.
1477     if (IsKnownUniform && isa<LoadInst>(I) &&
1478         !Legal->blockNeedsPredication(I->getParent()))
1479       return false;
1480     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1481       return false;
1482     // Loads and stores that need some form of masked operation are predicated
1483     // instructions.
1484     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1485       return Legal->isMaskRequired(I);
1486     return isScalarWithPredication(I, VF);
1487   }
1488 
1489   /// Returns true if \p I is a memory instruction with consecutive memory
1490   /// access that can be widened.
1491   bool
1492   memoryInstructionCanBeWidened(Instruction *I,
1493                                 ElementCount VF = ElementCount::getFixed(1));
1494 
1495   /// Returns true if \p I is a memory instruction in an interleaved-group
1496   /// of memory accesses that can be vectorized with wide vector loads/stores
1497   /// and shuffles.
1498   bool
1499   interleavedAccessCanBeWidened(Instruction *I,
1500                                 ElementCount VF = ElementCount::getFixed(1));
1501 
1502   /// Check if \p Instr belongs to any interleaved access group.
1503   bool isAccessInterleaved(Instruction *Instr) {
1504     return InterleaveInfo.isInterleaved(Instr);
1505   }
1506 
1507   /// Get the interleaved access group that \p Instr belongs to.
1508   const InterleaveGroup<Instruction> *
1509   getInterleavedAccessGroup(Instruction *Instr) {
1510     return InterleaveInfo.getInterleaveGroup(Instr);
1511   }
1512 
1513   /// Returns true if we're required to use a scalar epilogue for at least
1514   /// the final iteration of the original loop.
1515   bool requiresScalarEpilogue(ElementCount VF) const {
1516     if (!isScalarEpilogueAllowed())
1517       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
1519     // iteration in scalar form.
1520     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1521       return true;
1522     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1523   }
1524 
1525   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1526   /// loop hint annotation.
1527   bool isScalarEpilogueAllowed() const {
1528     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1529   }
1530 
  /// Returns true if all loop blocks should be masked to fold the loop tail.
1532   bool foldTailByMasking() const { return FoldTailByMasking; }
1533 
  /// Returns true if the instructions in this block require predication
1535   /// for any reason, e.g. because tail folding now requires a predicate
1536   /// or because the block in the original loop was predicated.
1537   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1538     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1539   }
1540 
1541   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1542   /// nodes to the chain of instructions representing the reductions. Uses a
1543   /// MapVector to ensure deterministic iteration order.
1544   using ReductionChainMap =
1545       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1546 
1547   /// Return the chain of instructions representing an inloop reduction.
1548   const ReductionChainMap &getInLoopReductionChains() const {
1549     return InLoopReductionChains;
1550   }
1551 
1552   /// Returns true if the Phi is part of an inloop reduction.
1553   bool isInLoopReduction(PHINode *Phi) const {
1554     return InLoopReductionChains.count(Phi);
1555   }
1556 
1557   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1558   /// with factor VF.  Return the cost of the instruction, including
1559   /// scalarization overhead if it's needed.
1560   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1561 
1562   /// Estimate cost of a call instruction CI if it were vectorized with factor
1563   /// VF. Return the cost of the instruction, including scalarization overhead
  /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1567   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1568                                     bool &NeedToScalarize) const;
1569 
1570   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1571   /// that of B.
1572   bool isMoreProfitable(const VectorizationFactor &A,
1573                         const VectorizationFactor &B) const;
1574 
1575   /// Invalidates decisions already taken by the cost model.
1576   void invalidateCostModelingDecisions() {
1577     WideningDecisions.clear();
1578     Uniforms.clear();
1579     Scalars.clear();
1580   }
1581 
1582 private:
1583   unsigned NumPredStores = 0;
1584 
  /// Convenience function that returns the value of vscale_range if
  /// vscale_range.min == vscale_range.max, or otherwise the value returned by
  /// the corresponding TTI method.
1588   Optional<unsigned> getVScaleForTuning() const;
1589 
1590   /// \return An upper bound for the vectorization factors for both
1591   /// fixed and scalable vectorization, where the minimum-known number of
1592   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1593   /// disabled or unsupported, then the scalable part will be equal to
1594   /// ElementCount::getScalable(0).
1595   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1596                                            ElementCount UserVF,
1597                                            bool FoldTailByMasking);
1598 
  /// \return the maximized element count based on the target's vector
1600   /// registers and the loop trip-count, but limited to a maximum safe VF.
1601   /// This is a helper function of computeFeasibleMaxVF.
1602   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1603                                        unsigned SmallestType,
1604                                        unsigned WidestType,
1605                                        ElementCount MaxSafeVF,
1606                                        bool FoldTailByMasking);
1607 
1608   /// \return the maximum legal scalable VF, based on the safe max number
1609   /// of elements.
1610   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1611 
1612   /// The vectorization cost is a combination of the cost itself and a boolean
1613   /// indicating whether any of the contributing operations will actually
1614   /// operate on vector values after type legalization in the backend. If this
1615   /// latter value is false, then all operations will be scalarized (i.e. no
1616   /// vectorization has actually taken place).
1617   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1618 
1619   /// Returns the expected execution cost. The unit of the cost does
1620   /// not matter because we use the 'cost' units to compare different
1621   /// vector widths. The cost that is returned is *not* normalized by
1622   /// the factor width. If \p Invalid is not nullptr, this function
1623   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1624   /// each instruction that has an Invalid cost for the given VF.
1625   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1626   VectorizationCostTy
1627   expectedCost(ElementCount VF,
1628                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1629 
1630   /// Returns the execution time cost of an instruction for a given vector
1631   /// width. Vector width of one means scalar.
1632   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1633 
1634   /// The cost-computation logic from getInstructionCost which provides
1635   /// the vector type as an output parameter.
1636   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1637                                      Type *&VectorTy);
1638 
1639   /// Return the cost of instructions in an inloop reduction pattern, if I is
1640   /// part of that pattern.
1641   Optional<InstructionCost>
1642   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1643                           TTI::TargetCostKind CostKind);
1644 
1645   /// Calculate vectorization cost of memory instruction \p I.
1646   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1647 
1648   /// The cost computation for scalarized memory instruction.
1649   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1650 
1651   /// The cost computation for interleaving group of memory instructions.
1652   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1653 
1654   /// The cost computation for Gather/Scatter instruction.
1655   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1656 
1657   /// The cost computation for widening instruction \p I with consecutive
1658   /// memory access.
1659   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1660 
1661   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1662   /// Load: scalar load + broadcast.
1663   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1664   /// element)
1665   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1666 
1667   /// Estimate the overhead of scalarizing an instruction. This is a
1668   /// convenience wrapper for the type-based getScalarizationOverhead API.
1669   InstructionCost getScalarizationOverhead(Instruction *I,
1670                                            ElementCount VF) const;
1671 
  /// Returns whether the instruction is a load or store and will be emitted
1673   /// as a vector operation.
1674   bool isConsecutiveLoadOrStore(Instruction *I);
1675 
1676   /// Returns true if an artificially high cost for emulated masked memrefs
1677   /// should be used.
1678   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1679 
1680   /// Map of scalar integer values to the smallest bitwidth they can be legally
1681   /// represented as. The vector equivalents of these values should be truncated
1682   /// to this type.
1683   MapVector<Instruction *, uint64_t> MinBWs;
1684 
1685   /// A type representing the costs for instructions if they were to be
1686   /// scalarized rather than vectorized. The entries are Instruction-Cost
1687   /// pairs.
1688   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1689 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1692   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1693 
1694   /// Records whether it is allowed to have the original scalar loop execute at
1695   /// least once. This may be needed as a fallback loop in case runtime
1696   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
1698   /// or as a peel-loop to handle gaps in interleave-groups.
1699   /// Under optsize and when the trip count is very small we don't allow any
1700   /// iterations to execute in the scalar loop.
1701   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1702 
1703   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1704   bool FoldTailByMasking = false;
1705 
1706   /// A map holding scalar costs for different vectorization factors. The
1707   /// presence of a cost for an instruction in the mapping indicates that the
1708   /// instruction will be scalarized when vectorizing with the associated
1709   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1710   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1711 
1712   /// Holds the instructions known to be uniform after vectorization.
1713   /// The data is collected per VF.
1714   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1715 
1716   /// Holds the instructions known to be scalar after vectorization.
1717   /// The data is collected per VF.
1718   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1719 
1720   /// Holds the instructions (address computations) that are forced to be
1721   /// scalarized.
1722   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1723 
1724   /// PHINodes of the reductions that should be expanded in-loop along with
1725   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1727   ReductionChainMap InLoopReductionChains;
1728 
1729   /// A Map of inloop reduction operations and their immediate chain operand.
1730   /// FIXME: This can be removed once reductions can be costed correctly in
1731   /// vplan. This was added to allow quick lookup to the inloop operations,
1732   /// without having to loop through InLoopReductionChains.
1733   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1734 
1735   /// Returns the expected difference in cost from scalarizing the expression
1736   /// feeding a predicated instruction \p PredInst. The instructions to
1737   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1738   /// non-negative return value implies the expression will be scalarized.
1739   /// Currently, only single-use chains are considered for scalarization.
1740   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1741                               ElementCount VF);
1742 
1743   /// Collect the instructions that are uniform after vectorization. An
1744   /// instruction is uniform if we represent it with a single scalar value in
1745   /// the vectorized loop corresponding to each vector iteration. Examples of
1746   /// uniform instructions include pointer operands of consecutive or
1747   /// interleaved memory accesses. Note that although uniformity implies an
1748   /// instruction will be scalar, the reverse is not true. In general, a
1749   /// scalarized instruction will be represented by VF scalar values in the
1750   /// vectorized loop, each corresponding to an iteration of the original
1751   /// scalar loop.
1752   void collectLoopUniforms(ElementCount VF);
1753 
1754   /// Collect the instructions that are scalar after vectorization. An
1755   /// instruction is scalar if it is known to be uniform or will be scalarized
1756   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1757   /// to the list if they are used by a load/store instruction that is marked as
1758   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1759   /// VF values in the vectorized loop, each corresponding to an iteration of
1760   /// the original scalar loop.
1761   void collectLoopScalars(ElementCount VF);
1762 
1763   /// Keeps cost model vectorization decision and cost for instructions.
1764   /// Right now it is used for memory instructions only.
1765   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1766                                 std::pair<InstWidening, InstructionCost>>;
1767 
1768   DecisionList WideningDecisions;
1769 
1770   /// Returns true if \p V is expected to be vectorized and it needs to be
1771   /// extracted.
1772   bool needsExtract(Value *V, ElementCount VF) const {
1773     Instruction *I = dyn_cast<Instruction>(V);
1774     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1775         TheLoop->isLoopInvariant(I))
1776       return false;
1777 
1778     // Assume we can vectorize V (and hence we need extraction) if the
1779     // scalars are not computed yet. This can happen, because it is called
1780     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1781     // the scalars are collected. That should be a safe assumption in most
1782     // cases, because we check if the operands have vectorizable types
1783     // beforehand in LoopVectorizationLegality.
1784     return Scalars.find(VF) == Scalars.end() ||
1785            !isScalarAfterVectorization(I, VF);
1786   };
1787 
1788   /// Returns a range containing only operands needing to be extracted.
1789   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1790                                                    ElementCount VF) const {
1791     return SmallVector<Value *, 4>(make_filter_range(
1792         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1793   }
1794 
1795   /// Determines if we have the infrastructure to vectorize loop \p L and its
1796   /// epilogue, assuming the main loop is vectorized by \p VF.
1797   bool isCandidateForEpilogueVectorization(const Loop &L,
1798                                            const ElementCount VF) const;
1799 
1800   /// Returns true if epilogue vectorization is considered profitable, and
1801   /// false otherwise.
1802   /// \p VF is the vectorization factor chosen for the original loop.
1803   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1804 
1805 public:
1806   /// The loop that we evaluate.
1807   Loop *TheLoop;
1808 
1809   /// Predicated scalar evolution analysis.
1810   PredicatedScalarEvolution &PSE;
1811 
1812   /// Loop Info analysis.
1813   LoopInfo *LI;
1814 
1815   /// Vectorization legality.
1816   LoopVectorizationLegality *Legal;
1817 
1818   /// Vector target information.
1819   const TargetTransformInfo &TTI;
1820 
1821   /// Target Library Info.
1822   const TargetLibraryInfo *TLI;
1823 
1824   /// Demanded bits analysis.
1825   DemandedBits *DB;
1826 
1827   /// Assumption cache.
1828   AssumptionCache *AC;
1829 
1830   /// Interface to emit optimization remarks.
1831   OptimizationRemarkEmitter *ORE;
1832 
1833   const Function *TheFunction;
1834 
1835   /// Loop Vectorize Hint.
1836   const LoopVectorizeHints *Hints;
1837 
1838   /// The interleave access information contains groups of interleaved accesses
  /// that have the same stride and are close to each other.
1840   InterleavedAccessInfo &InterleaveInfo;
1841 
1842   /// Values to ignore in the cost model.
1843   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1844 
1845   /// Values to ignore in the cost model when VF > 1.
1846   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1847 
1848   /// All element types found in the loop.
1849   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1850 
1851   /// Profitable vector factors.
1852   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1853 };
1854 } // end namespace llvm
1855 
1856 /// Helper struct to manage generating runtime checks for vectorization.
1857 ///
/// The runtime checks are created up-front in temporary blocks, un-linked from
/// the existing IR, to allow better cost estimation. After deciding to
1860 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1861 /// temporary blocks are completely removed.
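///
/// A typical use, as sketched by the interface below: construct a
/// GeneratedRTChecks, call Create() while estimating the cost of candidate
/// plans, and call emitSCEVChecks()/emitMemRuntimeChecks() only if
/// vectorization goes ahead; checks that were never emitted are cleaned up by
/// the destructor.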
1862 class GeneratedRTChecks {
1863   /// Basic block which contains the generated SCEV checks, if any.
1864   BasicBlock *SCEVCheckBlock = nullptr;
1865 
1866   /// The value representing the result of the generated SCEV checks. If it is
1867   /// nullptr, either no SCEV checks have been generated or they have been used.
1868   Value *SCEVCheckCond = nullptr;
1869 
1870   /// Basic block which contains the generated memory runtime checks, if any.
1871   BasicBlock *MemCheckBlock = nullptr;
1872 
1873   /// The value representing the result of the generated memory runtime checks.
1874   /// If it is nullptr, either no memory runtime checks have been generated or
1875   /// they have been used.
1876   Value *MemRuntimeCheckCond = nullptr;
1877 
1878   DominatorTree *DT;
1879   LoopInfo *LI;
1880 
1881   SCEVExpander SCEVExp;
1882   SCEVExpander MemCheckExp;
1883 
1884 public:
1885   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1886                     const DataLayout &DL)
1887       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1888         MemCheckExp(SE, DL, "scev.check") {}
1889 
1890   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1891   /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
1893   /// there is no vector code generation, the check blocks are removed
1894   /// completely.
1895   void Create(Loop *L, const LoopAccessInfo &LAI,
1896               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1897 
1898     BasicBlock *LoopHeader = L->getHeader();
1899     BasicBlock *Preheader = L->getLoopPreheader();
1900 
1901     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1902     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1903     // may be used by SCEVExpander. The blocks will be un-linked from their
1904     // predecessors and removed from LI & DT at the end of the function.
1905     if (!UnionPred.isAlwaysTrue()) {
1906       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1907                                   nullptr, "vector.scevcheck");
1908 
1909       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1910           &UnionPred, SCEVCheckBlock->getTerminator());
1911     }
1912 
1913     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1914     if (RtPtrChecking.Need) {
1915       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1916       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1917                                  "vector.memcheck");
1918 
1919       auto DiffChecks = RtPtrChecking.getDiffChecks();
1920       if (DiffChecks) {
1921         MemRuntimeCheckCond = addDiffRuntimeChecks(
1922             MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp,
1923             [VF](IRBuilderBase &B, unsigned Bits) {
1924               return getRuntimeVF(B, B.getIntNTy(Bits), VF);
1925             },
1926             IC);
1927       } else {
1928         MemRuntimeCheckCond =
1929             addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1930                              RtPtrChecking.getChecks(), MemCheckExp);
1931       }
1932       assert(MemRuntimeCheckCond &&
1933              "no RT checks generated although RtPtrChecking "
1934              "claimed checks are required");
1935     }
1936 
1937     if (!MemCheckBlock && !SCEVCheckBlock)
1938       return;
1939 
    // Unhook the temporary blocks with the checks and update various places
1941     // accordingly.
1942     if (SCEVCheckBlock)
1943       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1944     if (MemCheckBlock)
1945       MemCheckBlock->replaceAllUsesWith(Preheader);
1946 
1947     if (SCEVCheckBlock) {
1948       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1949       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1950       Preheader->getTerminator()->eraseFromParent();
1951     }
1952     if (MemCheckBlock) {
1953       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1954       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1955       Preheader->getTerminator()->eraseFromParent();
1956     }
1957 
1958     DT->changeImmediateDominator(LoopHeader, Preheader);
1959     if (MemCheckBlock) {
1960       DT->eraseNode(MemCheckBlock);
1961       LI->removeBlock(MemCheckBlock);
1962     }
1963     if (SCEVCheckBlock) {
1964       DT->eraseNode(SCEVCheckBlock);
1965       LI->removeBlock(SCEVCheckBlock);
1966     }
1967   }
1968 
1969   /// Remove the created SCEV & memory runtime check blocks & instructions, if
1970   /// unused.
1971   ~GeneratedRTChecks() {
1972     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
1973     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
1974     if (!SCEVCheckCond)
1975       SCEVCleaner.markResultUsed();
1976 
1977     if (!MemRuntimeCheckCond)
1978       MemCheckCleaner.markResultUsed();
1979 
1980     if (MemRuntimeCheckCond) {
1981       auto &SE = *MemCheckExp.getSE();
1982       // Memory runtime check generation creates compares that use expanded
1983       // values. Remove them before running the SCEVExpanderCleaners.
1984       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
1985         if (MemCheckExp.isInsertedInstruction(&I))
1986           continue;
1987         SE.forgetValue(&I);
1988         I.eraseFromParent();
1989       }
1990     }
1991     MemCheckCleaner.cleanup();
1992     SCEVCleaner.cleanup();
1993 
1994     if (SCEVCheckCond)
1995       SCEVCheckBlock->eraseFromParent();
1996     if (MemRuntimeCheckCond)
1997       MemCheckBlock->eraseFromParent();
1998   }
1999 
2000   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2001   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2002   /// depending on the generated condition.
2003   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2004                              BasicBlock *LoopVectorPreHeader,
2005                              BasicBlock *LoopExitBlock) {
2006     if (!SCEVCheckCond)
2007       return nullptr;
2008 
2009     Value *Cond = SCEVCheckCond;
2010     // Mark the check as used, to prevent it from being removed during cleanup.
2011     SCEVCheckCond = nullptr;
2012     if (auto *C = dyn_cast<ConstantInt>(Cond))
2013       if (C->isZero())
2014         return nullptr;
2015 
2016     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2017 
2018     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2019     // Create new preheader for vector loop.
2020     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2021       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2022 
2023     SCEVCheckBlock->getTerminator()->eraseFromParent();
2024     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2025     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2026                                                 SCEVCheckBlock);
2027 
2028     DT->addNewBlock(SCEVCheckBlock, Pred);
2029     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2030 
2031     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2032                         BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2033     return SCEVCheckBlock;
2034   }
2035 
2036   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2037   /// the branches to branch to the vector preheader or \p Bypass, depending on
2038   /// the generated condition.
2039   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2040                                    BasicBlock *LoopVectorPreHeader) {
2041     // Check if we generated code that checks in runtime if arrays overlap.
2042     if (!MemRuntimeCheckCond)
2043       return nullptr;
2044 
2045     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2046     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2047                                                 MemCheckBlock);
2048 
2049     DT->addNewBlock(MemCheckBlock, Pred);
2050     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2051     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2052 
2053     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2054       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2055 
2056     ReplaceInstWithInst(
2057         MemCheckBlock->getTerminator(),
2058         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2059     MemCheckBlock->getTerminator()->setDebugLoc(
2060         Pred->getTerminator()->getDebugLoc());
2061 
2062     // Mark the check as used, to prevent it from being removed during cleanup.
2063     MemRuntimeCheckCond = nullptr;
2064     return MemCheckBlock;
2065   }
2066 };
2067 
2068 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2069 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2071 // vector length information is not provided, vectorization is not considered
2072 // explicit. Interleave hints are not allowed either. These limitations will be
2073 // relaxed in the future.
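// For example, an outer loop annotated as follows would be treated as
// explicitly vectorized with a vector length of 4 under the rules above:
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];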
// Please note that we are currently forced to abuse the pragma 'clang
2075 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2076 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2077 // provides *explicit vectorization hints* (LV can bypass legal checks and
2078 // assume that vectorization is legal). However, both hints are implemented
2079 // using the same metadata (llvm.loop.vectorize, processed by
2080 // LoopVectorizeHints). This will be fixed in the future when the native IR
2081 // representation for pragma 'omp simd' is introduced.
2082 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2083                                    OptimizationRemarkEmitter *ORE) {
2084   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2085   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2086 
2087   // Only outer loops with an explicit vectorization hint are supported.
2088   // Unannotated outer loops are ignored.
2089   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2090     return false;
2091 
2092   Function *Fn = OuterLp->getHeader()->getParent();
2093   if (!Hints.allowVectorization(Fn, OuterLp,
2094                                 true /*VectorizeOnlyWhenForced*/)) {
2095     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2096     return false;
2097   }
2098 
2099   if (Hints.getInterleave() > 1) {
2100     // TODO: Interleave support is future work.
2101     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2102                          "outer loops.\n");
2103     Hints.emitRemarkWithHints();
2104     return false;
2105   }
2106 
2107   return true;
2108 }
2109 
2110 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2111                                   OptimizationRemarkEmitter *ORE,
2112                                   SmallVectorImpl<Loop *> &V) {
2113   // Collect inner loops and outer loops without irreducible control flow. For
2114   // now, only collect outer loops that have explicit vectorization hints. If we
2115   // are stress testing the VPlan H-CFG construction, we collect the outermost
2116   // loop of every loop nest.
2117   if (L.isInnermost() || VPlanBuildStressTest ||
2118       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2119     LoopBlocksRPO RPOT(&L);
2120     RPOT.perform(LI);
2121     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2122       V.push_back(&L);
2123       // TODO: Collect inner loops inside marked outer loops in case
2124       // vectorization fails for the outer loop. Do not invoke
2125       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2126       // already known to be reducible. We can use an inherited attribute for
2127       // that.
2128       return;
2129     }
2130   }
2131   for (Loop *InnerL : L)
2132     collectSupportedLoops(*InnerL, LI, ORE, V);
2133 }
2134 
2135 namespace {
2136 
2137 /// The LoopVectorize Pass.
2138 struct LoopVectorize : public FunctionPass {
2139   /// Pass identification, replacement for typeid
2140   static char ID;
2141 
2142   LoopVectorizePass Impl;
2143 
2144   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2145                          bool VectorizeOnlyWhenForced = false)
2146       : FunctionPass(ID),
2147         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2148     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2149   }
2150 
2151   bool runOnFunction(Function &F) override {
2152     if (skipFunction(F))
2153       return false;
2154 
2155     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2156     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2157     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2158     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2159     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2160     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2161     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2162     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2163     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2164     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2165     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2166     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2167     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2168 
2169     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2170         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2171 
2172     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2173                         GetLAA, *ORE, PSI).MadeAnyChange;
2174   }
2175 
2176   void getAnalysisUsage(AnalysisUsage &AU) const override {
2177     AU.addRequired<AssumptionCacheTracker>();
2178     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2179     AU.addRequired<DominatorTreeWrapperPass>();
2180     AU.addRequired<LoopInfoWrapperPass>();
2181     AU.addRequired<ScalarEvolutionWrapperPass>();
2182     AU.addRequired<TargetTransformInfoWrapperPass>();
2183     AU.addRequired<AAResultsWrapperPass>();
2184     AU.addRequired<LoopAccessLegacyAnalysis>();
2185     AU.addRequired<DemandedBitsWrapperPass>();
2186     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2187     AU.addRequired<InjectTLIMappingsLegacy>();
2188 
2189     // We currently do not preserve loopinfo/dominator analyses with outer loop
2190     // vectorization. Until this is addressed, mark these analyses as preserved
2191     // only for non-VPlan-native path.
2192     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2193     if (!EnableVPlanNativePath) {
2194       AU.addPreserved<LoopInfoWrapperPass>();
2195       AU.addPreserved<DominatorTreeWrapperPass>();
2196     }
2197 
2198     AU.addPreserved<BasicAAWrapperPass>();
2199     AU.addPreserved<GlobalsAAWrapperPass>();
2200     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2201   }
2202 };
2203 
2204 } // end anonymous namespace
2205 
2206 //===----------------------------------------------------------------------===//
2207 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2208 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2209 //===----------------------------------------------------------------------===//
2210 
2211 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2212   // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // inside the vector loop body.
2215   Instruction *Instr = dyn_cast<Instruction>(V);
2216   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2217                      (!Instr ||
2218                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2219   // Place the code for broadcasting invariant variables in the new preheader.
2220   IRBuilder<>::InsertPointGuard Guard(Builder);
2221   if (SafeToHoist)
2222     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2223 
2224   // Broadcast the scalar into all locations in the vector.
2225   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2226 
2227   return Shuf;
2228 }
2229 
2230 /// This function adds
2231 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is only relevant for FP induction variables.
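/// For example, with VF = 4, StartIdx = 0 and Step = 2, a splat input
/// Val = <v, v, v, v> produces <v + 0, v + 2, v + 4, v + 6>.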
2234 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2235                             Instruction::BinaryOps BinOp, ElementCount VF,
2236                             IRBuilderBase &Builder) {
2237   assert(VF.isVector() && "only vector VFs are supported");
2238 
2239   // Create and check the types.
2240   auto *ValVTy = cast<VectorType>(Val->getType());
2241   ElementCount VLen = ValVTy->getElementCount();
2242 
2243   Type *STy = Val->getType()->getScalarType();
2244   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2245          "Induction Step must be an integer or FP");
2246   assert(Step->getType() == STy && "Step has wrong type");
2247 
2248   SmallVector<Constant *, 8> Indices;
2249 
  // Create a vector of consecutive numbers from 0 to VF - 1.
2251   VectorType *InitVecValVTy = ValVTy;
2252   if (STy->isFloatingPointTy()) {
2253     Type *InitVecValSTy =
2254         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2255     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2256   }
2257   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2258 
2259   // Splat the StartIdx
2260   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2261 
2262   if (STy->isIntegerTy()) {
2263     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2264     Step = Builder.CreateVectorSplat(VLen, Step);
2265     assert(Step->getType() == Val->getType() && "Invalid step vec");
2266     // FIXME: The newly created binary instructions should contain nsw/nuw
2267     // flags, which can be found from the original scalar operations.
2268     Step = Builder.CreateMul(InitVec, Step);
2269     return Builder.CreateAdd(Val, Step, "induction");
2270   }
2271 
2272   // Floating point induction.
2273   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2274          "Binary Opcode should be specified for FP induction");
2275   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2276   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2277 
2278   Step = Builder.CreateVectorSplat(VLen, Step);
2279   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2280   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2281 }
2282 
2283 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2284 /// variable on which to base the steps, \p Step is the size of the step.
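/// For example, with UF = 2, VF = 4 and an integer induction with Step = 1,
/// part 0 produces the lane values ScalarIV + 0 .. ScalarIV + 3 and part 1
/// produces ScalarIV + 4 .. ScalarIV + 7.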
2285 static void buildScalarSteps(Value *ScalarIV, Value *Step,
2286                              const InductionDescriptor &ID, VPValue *Def,
2287                              VPTransformState &State) {
2288   IRBuilderBase &Builder = State.Builder;
2289   // We shouldn't have to build scalar steps if we aren't vectorizing.
2290   assert(State.VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2292   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2293   assert(ScalarIVTy == Step->getType() &&
2294          "Val and Step should have the same type");
2295 
2296   // We build scalar steps for both integer and floating-point induction
2297   // variables. Here, we determine the kind of arithmetic we will perform.
2298   Instruction::BinaryOps AddOp;
2299   Instruction::BinaryOps MulOp;
2300   if (ScalarIVTy->isIntegerTy()) {
2301     AddOp = Instruction::Add;
2302     MulOp = Instruction::Mul;
2303   } else {
2304     AddOp = ID.getInductionOpcode();
2305     MulOp = Instruction::FMul;
2306   }
2307 
2308   // Determine the number of scalars we need to generate for each unroll
2309   // iteration.
2310   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2311   unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2312   // Compute the scalar steps and save the results in State.
2313   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2314                                      ScalarIVTy->getScalarSizeInBits());
2315   Type *VecIVTy = nullptr;
2316   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2317   if (!FirstLaneOnly && State.VF.isScalable()) {
2318     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2319     UnitStepVec =
2320         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2321     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2322     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2323   }
2324 
2325   for (unsigned Part = 0; Part < State.UF; ++Part) {
2326     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2327 
2328     if (!FirstLaneOnly && State.VF.isScalable()) {
2329       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2330       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2331       if (ScalarIVTy->isFloatingPointTy())
2332         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2333       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2334       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2335       State.set(Def, Add, Part);
2336       // It's useful to record the lane values too for the known minimum number
2337       // of elements so we do those below. This improves the code quality when
2338       // trying to extract the first element, for example.
2339     }
2340 
2341     if (ScalarIVTy->isFloatingPointTy())
2342       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2343 
2344     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2345       Value *StartIdx = Builder.CreateBinOp(
2346           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2347       // The step returned by `createStepForVF` is a runtime-evaluated value
2348       // when VF is scalable. Otherwise, it should be folded into a Constant.
2349       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2350              "Expected StartIdx to be folded to a constant when VF is not "
2351              "scalable");
2352       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2353       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2354       State.set(Def, Add, VPIteration(Part, Lane));
2355     }
2356   }
2357 }
2358 
2359 // Generate code for the induction step. Note that induction steps are
// required to be loop-invariant.
2361 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2362                               Instruction *InsertBefore,
2363                               Loop *OrigLoop = nullptr) {
2364   const DataLayout &DL = SE.getDataLayout();
2365   assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2366          "Induction step should be loop invariant");
2367   if (auto *E = dyn_cast<SCEVUnknown>(Step))
2368     return E->getValue();
2369 
2370   SCEVExpander Exp(SE, DL, "induction");
2371   return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2372 }
2373 
2374 /// Compute the transformed value of Index at offset StartValue using step
2375 /// StepValue.
2376 /// For integer induction, returns StartValue + Index * StepValue.
2377 /// For pointer induction, returns StartValue[Index * StepValue].
2378 /// FIXME: The newly created binary instructions should contain nsw/nuw
2379 /// flags, which can be found from the original scalar operations.
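/// For example, an integer induction with StartValue = 10, Step = 3 and
/// Index = 4 yields 10 + 4 * 3 = 22, while a pointer induction with the same
/// Step and Index yields &StartValue[12].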
2380 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2381                                    Value *StartValue, Value *Step,
2382                                    const InductionDescriptor &ID) {
2383   assert(Index->getType()->getScalarType() == Step->getType() &&
2384          "Index scalar type does not match StepValue type");
2385 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
2392   auto CreateAdd = [&B](Value *X, Value *Y) {
2393     assert(X->getType() == Y->getType() && "Types don't match!");
2394     if (auto *CX = dyn_cast<ConstantInt>(X))
2395       if (CX->isZero())
2396         return Y;
2397     if (auto *CY = dyn_cast<ConstantInt>(Y))
2398       if (CY->isZero())
2399         return X;
2400     return B.CreateAdd(X, Y);
2401   };
2402 
2403   // We allow X to be a vector type, in which case Y will potentially be
2404   // splatted into a vector with the same element count.
2405   auto CreateMul = [&B](Value *X, Value *Y) {
2406     assert(X->getType()->getScalarType() == Y->getType() &&
2407            "Types don't match!");
2408     if (auto *CX = dyn_cast<ConstantInt>(X))
2409       if (CX->isOne())
2410         return Y;
2411     if (auto *CY = dyn_cast<ConstantInt>(Y))
2412       if (CY->isOne())
2413         return X;
2414     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2415     if (XVTy && !isa<VectorType>(Y->getType()))
2416       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2417     return B.CreateMul(X, Y);
2418   };
2419 
2420   switch (ID.getKind()) {
2421   case InductionDescriptor::IK_IntInduction: {
2422     assert(!isa<VectorType>(Index->getType()) &&
2423            "Vector indices not supported for integer inductions yet");
2424     assert(Index->getType() == StartValue->getType() &&
2425            "Index type does not match StartValue type");
2426     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2427       return B.CreateSub(StartValue, Index);
2428     auto *Offset = CreateMul(Index, Step);
2429     return CreateAdd(StartValue, Offset);
2430   }
2431   case InductionDescriptor::IK_PtrInduction: {
2432     assert(isa<Constant>(Step) &&
2433            "Expected constant step for pointer induction");
2434     return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2435   }
2436   case InductionDescriptor::IK_FpInduction: {
2437     assert(!isa<VectorType>(Index->getType()) &&
2438            "Vector indices not supported for FP inductions yet");
2439     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2440     auto InductionBinOp = ID.getInductionBinOp();
2441     assert(InductionBinOp &&
2442            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2443             InductionBinOp->getOpcode() == Instruction::FSub) &&
2444            "Original bin op should be defined for FP induction");
2445 
2446     Value *MulExp = B.CreateFMul(Step, Index);
2447     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2448                          "induction");
2449   }
2450   case InductionDescriptor::IK_NoInduction:
2451     return nullptr;
2452   }
2453   llvm_unreachable("invalid enum");
2454 }
2455 
2456 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2457                                                     const VPIteration &Instance,
2458                                                     VPTransformState &State) {
2459   Value *ScalarInst = State.get(Def, Instance);
2460   Value *VectorValue = State.get(Def, Instance.Part);
2461   VectorValue = Builder.CreateInsertElement(
2462       VectorValue, ScalarInst,
2463       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2464   State.set(Def, VectorValue, Instance.Part);
2465 }
2466 
2467 // Return whether we allow using masked interleave-groups (for dealing with
2468 // strided loads/stores that reside in predicated blocks, or for dealing
2469 // with gaps).
2470 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2471   // If an override option has been passed in for interleaved accesses, use it.
2472   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2473     return EnableMaskedInterleavedMemAccesses;
2474 
2475   return TTI.enableMaskedInterleavedAccessVectorization();
2476 }
2477 
2478 // Try to vectorize the interleave group that \p Instr belongs to.
2479 //
2480 // E.g. Translate following interleaved load group (factor = 3):
2481 //   for (i = 0; i < N; i+=3) {
2482 //     R = Pic[i];             // Member of index 0
2483 //     G = Pic[i+1];           // Member of index 1
2484 //     B = Pic[i+2];           // Member of index 2
2485 //     ... // do something to R, G, B
2486 //   }
2487 // To:
2488 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2489 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2490 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2491 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2492 //
2493 // Or translate following interleaved store group (factor = 3):
2494 //   for (i = 0; i < N; i+=3) {
2495 //     ... do something to R, G, B
2496 //     Pic[i]   = R;           // Member of index 0
2497 //     Pic[i+1] = G;           // Member of index 1
2498 //     Pic[i+2] = B;           // Member of index 2
2499 //   }
2500 // To:
2501 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2502 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2503 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2504 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2505 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2506 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2507     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2508     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2509     VPValue *BlockInMask) {
2510   Instruction *Instr = Group->getInsertPos();
2511   const DataLayout &DL = Instr->getModule()->getDataLayout();
2512 
2513   // Prepare for the vector type of the interleaved load/store.
2514   Type *ScalarTy = getLoadStoreType(Instr);
2515   unsigned InterleaveFactor = Group->getFactor();
2516   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2517   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2518 
2519   // Prepare for the new pointers.
2520   SmallVector<Value *, 2> AddrParts;
2521   unsigned Index = Group->getIndex(Instr);
2522 
2523   // TODO: extend the masked interleaved-group support to reversed access.
2524   assert((!BlockInMask || !Group->isReverse()) &&
2525          "Reversed masked interleave-group not supported.");
2526 
2527   // If the group is reverse, adjust the index to refer to the last vector lane
2528   // instead of the first. We adjust the index from the first vector lane,
2529   // rather than directly getting the pointer for lane VF - 1, because the
2530   // pointer operand of the interleaved access is supposed to be uniform. For
2531   // uniform instructions, we're only required to generate a value for the
2532   // first vector lane in each unroll iteration.
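  // For example (illustrative), with VF = 4 and an interleave factor of 2 the
  // index below grows by (4 - 1) * 2 = 6, so the adjusted pointer addresses
  // the group as seen from the last vector lane.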
2533   if (Group->isReverse())
2534     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2535 
2536   for (unsigned Part = 0; Part < UF; Part++) {
2537     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2538     setDebugLocFromInst(AddrPart);
2539 
    // Note that the current instruction could be at any member index. We need
    // to adjust the address to that of the member at index 0.
2542     //
2543     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2544     //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2546     //
2547     // E.g.  A[i+1] = a;     // Member of index 1
2548     //       A[i]   = b;     // Member of index 0
2549     //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2551 
2552     bool InBounds = false;
2553     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2554       InBounds = gep->isInBounds();
2555     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2556     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2557 
2558     // Cast to the vector pointer type.
2559     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2560     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2561     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2562   }
2563 
2564   setDebugLocFromInst(Instr);
2565   Value *PoisonVec = PoisonValue::get(VecTy);
2566 
2567   Value *MaskForGaps = nullptr;
2568   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2569     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2570     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2571   }
2572 
2573   // Vectorize the interleaved load group.
2574   if (isa<LoadInst>(Instr)) {
2575     // For each unroll part, create a wide load for the group.
2576     SmallVector<Value *, 2> NewLoads;
2577     for (unsigned Part = 0; Part < UF; Part++) {
2578       Instruction *NewLoad;
2579       if (BlockInMask || MaskForGaps) {
2580         assert(useMaskedInterleavedAccesses(*TTI) &&
2581                "masked interleaved groups are not allowed.");
2582         Value *GroupMask = MaskForGaps;
2583         if (BlockInMask) {
2584           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2585           Value *ShuffledMask = Builder.CreateShuffleVector(
2586               BlockInMaskPart,
2587               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2588               "interleaved.mask");
2589           GroupMask = MaskForGaps
2590                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2591                                                 MaskForGaps)
2592                           : ShuffledMask;
2593         }
2594         NewLoad =
2595             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2596                                      GroupMask, PoisonVec, "wide.masked.vec");
2597       }
2598       else
2599         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2600                                             Group->getAlign(), "wide.vec");
2601       Group->addMetadata(NewLoad);
2602       NewLoads.push_back(NewLoad);
2603     }
2604 
2605     // For each member in the group, shuffle out the appropriate data from the
2606     // wide loads.
2607     unsigned J = 0;
2608     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2609       Instruction *Member = Group->getMember(I);
2610 
2611       // Skip the gaps in the group.
2612       if (!Member)
2613         continue;
2614 
2615       auto StrideMask =
2616           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2617       for (unsigned Part = 0; Part < UF; Part++) {
2618         Value *StridedVec = Builder.CreateShuffleVector(
2619             NewLoads[Part], StrideMask, "strided.vec");
2620 
        // If this member has a different type, cast the result to that type.
2622         if (Member->getType() != ScalarTy) {
2623           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2624           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2625           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2626         }
2627 
2628         if (Group->isReverse())
2629           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2630 
2631         State.set(VPDefs[J], StridedVec, Part);
2632       }
2633       ++J;
2634     }
2635     return;
2636   }
2637 
  // The sub vector type for the current instruction.
2639   auto *SubVT = VectorType::get(ScalarTy, VF);
2640 
2641   // Vectorize the interleaved store group.
2642   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2643   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2644          "masked interleaved groups are not allowed.");
2645   assert((!MaskForGaps || !VF.isScalable()) &&
2646          "masking gaps for scalable vectors is not yet supported.");
2647   for (unsigned Part = 0; Part < UF; Part++) {
2648     // Collect the stored vector from each member.
2649     SmallVector<Value *, 4> StoredVecs;
2650     for (unsigned i = 0; i < InterleaveFactor; i++) {
2651       assert((Group->getMember(i) || MaskForGaps) &&
2652              "Fail to get a member from an interleaved store group");
2653       Instruction *Member = Group->getMember(i);
2654 
2655       // Skip the gaps in the group.
2656       if (!Member) {
2657         Value *Undef = PoisonValue::get(SubVT);
2658         StoredVecs.push_back(Undef);
2659         continue;
2660       }
2661 
2662       Value *StoredVec = State.get(StoredValues[i], Part);
2663 
2664       if (Group->isReverse())
2665         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2666 
      // If this member has a different type, cast it to the unified type.
2669       if (StoredVec->getType() != SubVT)
2670         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2671 
2672       StoredVecs.push_back(StoredVec);
2673     }
2674 
2675     // Concatenate all vectors into a wide vector.
2676     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2677 
2678     // Interleave the elements in the wide vector.
2679     Value *IVec = Builder.CreateShuffleVector(
2680         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2681         "interleaved.vec");
2682 
2683     Instruction *NewStoreInstr;
2684     if (BlockInMask || MaskForGaps) {
2685       Value *GroupMask = MaskForGaps;
2686       if (BlockInMask) {
2687         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2688         Value *ShuffledMask = Builder.CreateShuffleVector(
2689             BlockInMaskPart,
2690             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2691             "interleaved.mask");
2692         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2693                                                       ShuffledMask, MaskForGaps)
2694                                 : ShuffledMask;
2695       }
2696       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2697                                                 Group->getAlign(), GroupMask);
2698     } else
2699       NewStoreInstr =
2700           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2701 
2702     Group->addMetadata(NewStoreInstr);
2703   }
2704 }
2705 
2706 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2707                                                VPReplicateRecipe *RepRecipe,
2708                                                const VPIteration &Instance,
2709                                                bool IfPredicateInstr,
2710                                                VPTransformState &State) {
2711   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2712 
2713   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2714   // the first lane and part.
2715   if (isa<NoAliasScopeDeclInst>(Instr))
2716     if (!Instance.isFirstIteration())
2717       return;
2718 
  // Does this instruction return a value?
2720   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2721 
2722   Instruction *Cloned = Instr->clone();
2723   if (!IsVoidRetTy)
2724     Cloned->setName(Instr->getName() + ".cloned");
2725 
  // If the scalarized instruction contributes to the address computation of a
  // widened masked load/store which was in a basic block that needed
  // predication and is not predicated after vectorization, we can't propagate
  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
  // instruction could feed a poison value to the base address of the widened
  // load/store.
2732   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2733     Cloned->dropPoisonGeneratingFlags();
2734 
2735   if (Instr->getDebugLoc())
2736     setDebugLocFromInst(Instr);
2737 
2738   // Replace the operands of the cloned instructions with their scalar
2739   // equivalents in the new loop.
2740   for (auto &I : enumerate(RepRecipe->operands())) {
2741     auto InputInstance = Instance;
2742     VPValue *Operand = I.value();
2743     VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
2744     if (OperandR && OperandR->isUniform())
2745       InputInstance.Lane = VPLane::getFirstLane();
2746     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2747   }
2748   State.addNewMetadata(Cloned, Instr);
2749 
2750   // Place the cloned scalar in the new loop.
2751   State.Builder.Insert(Cloned);
2752 
2753   State.set(RepRecipe, Cloned, Instance);
2754 
  // If we just cloned a new assumption, add it to the assumption cache.
2756   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2757     AC->registerAssumption(II);
2758 
2759   // End if-block.
2760   if (IfPredicateInstr)
2761     PredicatedInstructions.push_back(Cloned);
2762 }
2763 
2764 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2765   if (TripCount)
2766     return TripCount;
2767 
2768   assert(InsertBlock);
2769   IRBuilder<> Builder(InsertBlock->getTerminator());
2770   // Find the loop boundaries.
2771   ScalarEvolution *SE = PSE.getSE();
2772   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2773   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2774          "Invalid loop count");
2775 
2776   Type *IdxTy = Legal->getWidestInductionType();
2777   assert(IdxTy && "No type for induction");
2778 
  // The exit count might have type i64 while the phi is i32. This can happen
  // if we have an induction variable that is sign-extended before the compare.
  // The only way we can get a backedge-taken count is if the induction
  // variable is signed and, as such, will not overflow. In such a case
  // truncation is legal.
2784   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2785       IdxTy->getPrimitiveSizeInBits())
2786     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2787   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2788 
2789   // Get the total trip count from the count by adding 1.
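  // For example, a loop iterating over i = 0..99 has a backedge-taken count
  // of 99 and hence a trip count of 100.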
2790   const SCEV *ExitCount = SE->getAddExpr(
2791       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2792 
2793   const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2794 
2795   // Expand the trip count and place the new instructions in the preheader.
2796   // Notice that the pre-header does not change, only the loop body.
2797   SCEVExpander Exp(*SE, DL, "induction");
2798 
2799   // Count holds the overall loop count (N).
2800   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2801                                 InsertBlock->getTerminator());
2802 
2803   if (TripCount->getType()->isPointerTy())
2804     TripCount =
2805         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2806                                     InsertBlock->getTerminator());
2807 
2808   return TripCount;
2809 }
2810 
2811 Value *
2812 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2813   if (VectorTripCount)
2814     return VectorTripCount;
2815 
2816   Value *TC = getOrCreateTripCount(InsertBlock);
2817   IRBuilder<> Builder(InsertBlock->getTerminator());
2818 
2819   Type *Ty = TC->getType();
2820   // This is where we can make the step a runtime constant.
2821   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2822 
2823   // If the tail is to be folded by masking, round the number of iterations N
2824   // up to a multiple of Step instead of rounding down. This is done by first
2825   // adding Step-1 and then rounding down. Note that it's ok if this addition
2826   // overflows: the vector induction variable will eventually wrap to zero given
2827   // that it starts at zero and its Step is a power of two; the loop will then
2828   // exit, with the last early-exit vector comparison also producing all-true.
2829   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2830   // is accounted for in emitIterationCountCheck that adds an overflow check.
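  // As a worked example of the tail-folded case, with Step = 8 and N = 10:
  // the addition below yields 10 + 7 = 17, the remainder computed later is
  // 17 % 8 = 1, and the vector trip count becomes 16, covering all ten
  // iterations with the excess lanes masked off.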
2831   if (Cost->foldTailByMasking()) {
2832     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2833            "VF*UF must be a power of 2 when folding tail by masking");
2834     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2835     TC = Builder.CreateAdd(
2836         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2837   }
2838 
2839   // Now we need to generate the expression for the part of the loop that the
2840   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2841   // iterations are not required for correctness, or N - Step, otherwise. Step
2842   // is equal to the vectorization factor (number of SIMD elements) times the
2843   // unroll factor (number of SIMD instructions).
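  // For example, with VF = 4 and UF = 2 (Step = 8) and a trip count of 100,
  // N % Step = 4, so the vector loop executes 96 iterations and the scalar
  // remainder handles the last 4.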
2844   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2845 
2846   // There are cases where we *must* run at least one iteration in the remainder
2847   // loop.  See the cost model for when this can happen.  If the step evenly
2848   // divides the trip count, we set the remainder to be equal to the step. If
2849   // the step does not evenly divide the trip count, no adjustment is necessary
2850   // since there will already be scalar iterations. Note that the minimum
2851   // iterations check ensures that N >= Step.
2852   if (Cost->requiresScalarEpilogue(VF)) {
2853     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2854     R = Builder.CreateSelect(IsZero, Step, R);
2855   }
2856 
2857   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2858 
2859   return VectorTripCount;
2860 }
2861 
2862 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2863                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as
  // DstVTy.
2865   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2866   unsigned VF = DstFVTy->getNumElements();
2867   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2869   Type *SrcElemTy = SrcVecTy->getElementType();
2870   Type *DstElemTy = DstFVTy->getElementType();
2871   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2872          "Vector elements must have same size");
2873 
2874   // Do a direct cast if element types are castable.
2875   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2876     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2877   }
  // V cannot be directly cast to the desired vector type. This may happen
  // when V is a floating-point vector while DstVTy is a vector of pointers,
  // or vice-versa. Handle this with a two-step bitcast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
2882   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2883          "Only one type should be a pointer type");
2884   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2885          "Only one type should be a floating point type");
2886   Type *IntTy =
2887       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2888   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2889   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2890   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2891 }
2892 
2893 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2894   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
  // Reuse the existing vector loop preheader for the TC checks.
  // Note that a new preheader block is generated for the vector loop.
2897   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2898   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2899 
2900   // Generate code to check if the loop's trip count is less than VF * UF, or
2901   // equal to it in case a scalar epilogue is required; this implies that the
2902   // vector trip count is zero. This check also covers the case where adding one
2903   // to the backedge-taken count overflowed leading to an incorrect trip count
2904   // of zero. In this case we will also jump to the scalar loop.
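  // For example, with VF = 4 and UF = 2 the check below is N < 8 (or N <= 8
  // when a scalar epilogue must run); in either case the vector loop, which
  // would execute zero vector iterations, is bypassed.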
2905   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2906                                             : ICmpInst::ICMP_ULT;
2907 
2908   // If tail is to be folded, vector loop takes care of all iterations.
2909   Type *CountTy = Count->getType();
2910   Value *CheckMinIters = Builder.getFalse();
2911   Value *Step = createStepForVF(Builder, CountTy, VF, UF);
2912   if (!Cost->foldTailByMasking())
2913     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
2914   else if (VF.isScalable()) {
2915     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2916     // an overflow to zero when updating induction variables and so an
2917     // additional overflow check is required before entering the vector loop.
2918 
2919     // Get the maximum unsigned value for the type.
2920     Value *MaxUIntTripCount =
2921         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2922     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2923 
2924     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2925     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, Step);
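    // For instance (illustrative), with an i32 trip count of UINT32_MAX - 2
    // and a runtime step of 8, UMax - Count = 2 < 8, so CheckMinIters is true
    // and the scalar loop is taken instead.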
2926   }
2927   // Create new preheader for vector loop.
2928   LoopVectorPreHeader =
2929       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2930                  "vector.ph");
2931 
2932   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2933                                DT->getNode(Bypass)->getIDom()) &&
2934          "TC check is expected to dominate Bypass");
2935 
2936   // Update dominator for Bypass & LoopExit (if needed).
2937   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2938   if (!Cost->requiresScalarEpilogue(VF))
2939     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
2941     // dominator of the exit blocks.
2942     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2943 
2944   ReplaceInstWithInst(
2945       TCCheckBlock->getTerminator(),
2946       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2947   LoopBypassBlocks.push_back(TCCheckBlock);
2948 }
2949 
2950 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2951 
2952   BasicBlock *const SCEVCheckBlock =
2953       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
2954   if (!SCEVCheckBlock)
2955     return nullptr;
2956 
2957   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
2958            (OptForSizeBasedOnProfile &&
2959             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
2960          "Cannot SCEV check stride or overflow when optimizing for size");
  // Update the dominator only if this is the first RT check.
2964   if (LoopBypassBlocks.empty()) {
2965     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2966     if (!Cost->requiresScalarEpilogue(VF))
2967       // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
2969       // dominator of the exit blocks.
2970       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2971   }
2972 
2973   LoopBypassBlocks.push_back(SCEVCheckBlock);
2974   AddedSafetyChecks = true;
2975   return SCEVCheckBlock;
2976 }
2977 
2978 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
2979   // VPlan-native path does not do any analysis for runtime checks currently.
2980   if (EnableVPlanNativePath)
2981     return nullptr;
2982 
2983   BasicBlock *const MemCheckBlock =
2984       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
2985 
  // Check if we generated code that checks at runtime whether arrays overlap.
  // We put the checks into a separate block to make the more common case of
  // few elements faster.
2989   if (!MemCheckBlock)
2990     return nullptr;
2991 
2992   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
2993     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2994            "Cannot emit memory checks when optimizing for size, unless forced "
2995            "to vectorize.");
2996     ORE->emit([&]() {
2997       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2998                                         OrigLoop->getStartLoc(),
2999                                         OrigLoop->getHeader())
3000              << "Code-size may be reduced by not forcing "
3001                 "vectorization, or by source-code modifications "
3002                 "eliminating the need for runtime checks "
3003                 "(e.g., adding 'restrict').";
3004     });
3005   }
3006 
3007   LoopBypassBlocks.push_back(MemCheckBlock);
3008 
3009   AddedSafetyChecks = true;
3010 
3011   return MemCheckBlock;
3012 }
3013 
3014 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3015   LoopScalarBody = OrigLoop->getHeader();
3016   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3017   assert(LoopVectorPreHeader && "Invalid loop structure");
3018   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3019   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3020          "multiple exit loop without required epilogue?");
3021 
3022   LoopMiddleBlock =
3023       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3024                  LI, nullptr, Twine(Prefix) + "middle.block");
3025   LoopScalarPreHeader =
3026       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3027                  nullptr, Twine(Prefix) + "scalar.ph");
3028 
3029   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3030 
3031   // Set up the middle block terminator.  Two cases:
3032   // 1) If we know that we must execute the scalar epilogue, emit an
3033   //    unconditional branch.
3034   // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case).  In this case, set up a conditional
3036   //    branch from the middle block to the loop scalar preheader, and the
3037   //    exit block.  completeLoopSkeleton will update the condition to use an
3038   //    iteration check, if required to decide whether to execute the remainder.
3039   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3040     BranchInst::Create(LoopScalarPreHeader) :
3041     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3042                        Builder.getTrue());
3043   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3044   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3045 
3046   // Update dominator for loop exit. During skeleton creation, only the vector
3047   // pre-header and the middle block are created. The vector loop is entirely
  // created during VPlan execution.
3049   if (!Cost->requiresScalarEpilogue(VF))
3050     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3052     // dominator of the exit blocks.
3053     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3054 }
3055 
3056 void InnerLoopVectorizer::createInductionResumeValues(
3057     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3058   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3059           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3060          "Inconsistent information about additional bypass.");
3061 
3062   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3063   assert(VectorTripCount && "Expected valid arguments");
3064   // We are going to resume the execution of the scalar loop.
3065   // Go over all of the induction variables that we found and fix the
3066   // PHIs that are left in the scalar version of the loop.
3067   // The starting values of PHI nodes depend on the counter of the last
3068   // iteration in the vectorized loop.
3069   // If we come from a bypass edge then we need to start from the original
3070   // start value.
3071   Instruction *OldInduction = Legal->getPrimaryInduction();
3072   for (auto &InductionEntry : Legal->getInductionVars()) {
3073     PHINode *OrigPhi = InductionEntry.first;
3074     InductionDescriptor II = InductionEntry.second;
3075 
3076     Value *&EndValue = IVEndValues[OrigPhi];
3077     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3078     if (OrigPhi == OldInduction) {
3079       // We know what the end value is.
3080       EndValue = VectorTripCount;
3081     } else {
3082       IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3083 
3084       // Fast-math-flags propagate from the original induction instruction.
3085       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3086         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3087 
3088       Type *StepType = II.getStep()->getType();
3089       Instruction::CastOps CastOp =
3090           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3091       Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc");
3092       Value *Step =
3093           CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3094       EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3095       EndValue->setName("ind.end");
3096 
3097       // Compute the end value for the additional bypass (if applicable).
3098       if (AdditionalBypass.first) {
3099         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3100         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3101                                          StepType, true);
3102         Value *Step =
3103             CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3104         VTC =
3105             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc");
3106         EndValueFromAdditionalBypass =
3107             emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3108         EndValueFromAdditionalBypass->setName("ind.end");
3109       }
3110     }
3111 
    // Create phi nodes to merge from the backedge-taken check block.
3113     PHINode *BCResumeVal =
3114         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3115                         LoopScalarPreHeader->getTerminator());
3116     // Copy original phi DL over to the new one.
3117     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3118 
3119     // The new PHI merges the original incoming value, in case of a bypass,
3120     // or the value at the end of the vectorized loop.
3121     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3122 
3123     // Fix the scalar body counter (PHI node).
3124     // The old induction's phi node in the scalar body needs the truncated
3125     // value.
3126     for (BasicBlock *BB : LoopBypassBlocks)
3127       BCResumeVal->addIncoming(II.getStartValue(), BB);
3128 
3129     if (AdditionalBypass.first)
3130       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3131                                             EndValueFromAdditionalBypass);
3132 
3133     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3134   }
3135 }
3136 
3137 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) {
3138   // The trip counts should be cached by now.
3139   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3140   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3141 
3142   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3143 
3144   // Add a check in the middle block to see if we have completed
3145   // all of the iterations in the first vector loop.  Three cases:
3146   // 1) If we require a scalar epilogue, there is no conditional branch as
3147   //    we unconditionally branch to the scalar preheader.  Do nothing.
3148   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3149   //    Thus if tail is to be folded, we know we don't need to run the
3150   //    remainder and we can use the previous value for the condition (true).
3151   // 3) Otherwise, construct a runtime check.
3152   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3153     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3154                                         Count, VectorTripCount, "cmp.n",
3155                                         LoopMiddleBlock->getTerminator());
3156 
3157     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3158     // of the corresponding compare because they may have ended up with
3159     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3161     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3162     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3163   }
3164 
3165 #ifdef EXPENSIVE_CHECKS
3166   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3167 #endif
3168 
3169   return LoopVectorPreHeader;
3170 }
3171 
3172 std::pair<BasicBlock *, Value *>
3173 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3174   /*
3175    In this function we generate a new loop. The new loop will contain
3176    the vectorized instructions while the old loop will continue to run the
3177    scalar remainder.
3178 
3179        [ ] <-- loop iteration number check.
3180     /   |
3181    /    v
3182   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3183   |  /  |
3184   | /   v
3185   ||   [ ]     <-- vector pre header.
3186   |/    |
3187   |     v
3188   |    [  ] \
3189   |    [  ]_|   <-- vector loop (created during VPlan execution).
3190   |     |
3191   |     v
3192   \   -[ ]   <--- middle-block.
3193    \/   |
3194    /\   v
3195    | ->[ ]     <--- new preheader.
3196    |    |
3197  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3198    |   [ ] \
3199    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3200     \   |
3201      \  v
3202       >[ ]     <-- exit block(s).
3203    ...
3204    */
3205 
3206   // Get the metadata of the original loop before it gets modified.
3207   MDNode *OrigLoopID = OrigLoop->getLoopID();
3208 
3209   // Workaround!  Compute the trip count of the original loop and cache it
3210   // before we start modifying the CFG.  This code has a systemic problem
3211   // wherein it tries to run analysis over partially constructed IR; this is
3212   // wrong, and not simply for SCEV.  The trip count of the original loop
3213   // simply happens to be prone to hitting this in practice.  In theory, we
3214   // can hit the same issue for any SCEV, or ValueTracking query done during
3215   // mutation.  See PR49900.
3216   getOrCreateTripCount(OrigLoop->getLoopPreheader());
3217 
3218   // Create an empty vector loop, and prepare basic blocks for the runtime
3219   // checks.
3220   createVectorLoopSkeleton("");
3221 
3222   // Now, compare the new count to zero. If it is zero skip the vector loop and
3223   // jump to the scalar loop. This check also covers the case where the
3224   // backedge-taken count is uint##_max: adding one to it will overflow leading
3225   // to an incorrect trip count of zero. In this (rare) case we will also jump
3226   // to the scalar loop.
3227   emitIterationCountCheck(LoopScalarPreHeader);
3228 
3229   // Generate the code to check any assumptions that we've made for SCEV
3230   // expressions.
3231   emitSCEVChecks(LoopScalarPreHeader);
3232 
  // Generate the code that checks at runtime whether arrays overlap. We put
  // the checks into a separate block to make the more common case of few
  // elements faster.
3236   emitMemRuntimeChecks(LoopScalarPreHeader);
3237 
3238   // Emit phis for the new starting index of the scalar loop.
3239   createInductionResumeValues();
3240 
3241   return {completeLoopSkeleton(OrigLoopID), nullptr};
3242 }
3243 
3244 // Fix up external users of the induction variable. At this point, we are
3245 // in LCSSA form, with all external PHIs that use the IV having one input value,
3246 // coming from the remainder loop. We need those PHIs to also have a correct
3247 // value for the IV when arriving directly from the middle block.
3248 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3249                                        const InductionDescriptor &II,
3250                                        Value *VectorTripCount, Value *EndValue,
3251                                        BasicBlock *MiddleBlock,
3252                                        BasicBlock *VectorHeader, VPlan &Plan) {
3253   // There are two kinds of external IV usages - those that use the value
3254   // computed in the last iteration (the PHI) and those that use the penultimate
3255   // value (the value that feeds into the phi from the loop latch).
3256   // We allow both, but they, obviously, have different values.
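  //
  // Illustrative example for a simple counted loop:
  //   %iv      = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
  //   %iv.next = add i64 %iv, 1
  // An LCSSA phi that uses %iv.next receives EndValue, while one that uses
  // %iv receives the penultimate value EndValue - Step, recomputed below.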
3257 
3258   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3259 
3260   DenseMap<Value *, Value *> MissingVals;
3261 
3262   // An external user of the last iteration's value should see the value that
3263   // the remainder loop uses to initialize its own IV.
3264   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3265   for (User *U : PostInc->users()) {
3266     Instruction *UI = cast<Instruction>(U);
3267     if (!OrigLoop->contains(UI)) {
3268       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3269       MissingVals[UI] = EndValue;
3270     }
3271   }
3272 
  // An external user of the penultimate value needs to see EndValue - Step.
  // The simplest way to get this is to recompute it from its constituent
  // SCEVs, that is Start + (Step * (CRD - 1)).
3276   for (User *U : OrigPhi->users()) {
3277     auto *UI = cast<Instruction>(U);
3278     if (!OrigLoop->contains(UI)) {
3279       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3280 
3281       IRBuilder<> B(MiddleBlock->getTerminator());
3282 
3283       // Fast-math-flags propagate from the original induction instruction.
3284       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3285         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3286 
3287       Value *CountMinusOne = B.CreateSub(
3288           VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3289       Value *CMO =
3290           !II.getStep()->getType()->isIntegerTy()
3291               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3292                              II.getStep()->getType())
3293               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3294       CMO->setName("cast.cmo");
3295 
3296       Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3297                                     VectorHeader->getTerminator());
3298       Value *Escape =
3299           emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
3300       Escape->setName("ind.escape");
3301       MissingVals[UI] = Escape;
3302     }
3303   }
3304 
3305   for (auto &I : MissingVals) {
3306     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3308     // that is %IV2 = phi [...], [ %IV1, %latch ]
3309     // In this case, if IV1 has an external use, we need to avoid adding both
3310     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3311     // don't already have an incoming value for the middle block.
3312     if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3313       PHI->addIncoming(I.second, MiddleBlock);
3314       Plan.removeLiveOut(PHI);
3315     }
3316   }
3317 }
3318 
3319 namespace {
3320 
3321 struct CSEDenseMapInfo {
3322   static bool canHandle(const Instruction *I) {
3323     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3324            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3325   }
3326 
3327   static inline Instruction *getEmptyKey() {
3328     return DenseMapInfo<Instruction *>::getEmptyKey();
3329   }
3330 
3331   static inline Instruction *getTombstoneKey() {
3332     return DenseMapInfo<Instruction *>::getTombstoneKey();
3333   }
3334 
3335   static unsigned getHashValue(const Instruction *I) {
3336     assert(canHandle(I) && "Unknown instruction!");
3337     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3338                                                            I->value_op_end()));
3339   }
3340 
3341   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3342     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3343         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3344       return LHS == RHS;
3345     return LHS->isIdenticalTo(RHS);
3346   }
3347 };
3348 
3349 } // end anonymous namespace
3350 
/// Perform CSE of induction variable instructions.
3352 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3354   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3355   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3356     if (!CSEDenseMapInfo::canHandle(&In))
3357       continue;
3358 
3359     // Check if we can replace this instruction with any of the
3360     // visited instructions.
3361     if (Instruction *V = CSEMap.lookup(&In)) {
3362       In.replaceAllUsesWith(V);
3363       In.eraseFromParent();
3364       continue;
3365     }
3366 
3367     CSEMap[&In] = &In;
3368   }
3369 }
3370 
3371 InstructionCost
3372 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3373                                               bool &NeedToScalarize) const {
3374   Function *F = CI->getCalledFunction();
3375   Type *ScalarRetTy = CI->getType();
3376   SmallVector<Type *, 4> Tys, ScalarTys;
3377   for (auto &ArgOp : CI->args())
3378     ScalarTys.push_back(ArgOp->getType());
3379 
3380   // Estimate cost of scalarized vector call. The source operands are assumed
3381   // to be vectors, so we need to extract individual elements from there,
3382   // execute VF scalar calls, and then gather the result into the vector return
3383   // value.
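  // For example (illustrative numbers), with VF = 4, a scalar call cost of 10
  // and a scalarization overhead of 6, the scalarized estimate computed below
  // is 4 * 10 + 6 = 46; it is kept only if no cheaper vector variant exists.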
3384   InstructionCost ScalarCallCost =
3385       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3386   if (VF.isScalar())
3387     return ScalarCallCost;
3388 
3389   // Compute corresponding vector type for return value and arguments.
3390   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3391   for (Type *ScalarTy : ScalarTys)
3392     Tys.push_back(ToVectorTy(ScalarTy, VF));
3393 
3394   // Compute costs of unpacking argument values for the scalar calls and
3395   // packing the return values to a vector.
3396   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3397 
3398   InstructionCost Cost =
3399       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3400 
3401   // If we can't emit a vector call for this function, then the currently found
3402   // cost is the cost we need to return.
3403   NeedToScalarize = true;
3404   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3405   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3406 
3407   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3408     return Cost;
3409 
3410   // If the corresponding vector cost is cheaper, return its cost.
3411   InstructionCost VectorCallCost =
3412       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3413   if (VectorCallCost < Cost) {
3414     NeedToScalarize = false;
3415     Cost = VectorCallCost;
3416   }
3417   return Cost;
3418 }
3419 
3420 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3421   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3422     return Elt;
3423   return VectorType::get(Elt, VF);
3424 }
3425 
3426 InstructionCost
3427 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3428                                                    ElementCount VF) const {
3429   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3430   assert(ID && "Expected intrinsic call!");
3431   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3432   FastMathFlags FMF;
3433   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3434     FMF = FPMO->getFastMathFlags();
3435 
3436   SmallVector<const Value *> Arguments(CI->args());
3437   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3438   SmallVector<Type *> ParamTys;
3439   std::transform(FTy->param_begin(), FTy->param_end(),
3440                  std::back_inserter(ParamTys),
3441                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3442 
3443   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3444                                     dyn_cast<IntrinsicInst>(CI));
3445   return TTI.getIntrinsicInstrCost(CostAttrs,
3446                                    TargetTransformInfo::TCK_RecipThroughput);
3447 }
3448 
3449 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3450   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3451   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3452   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3453 }
3454 
3455 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3456   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3457   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3458   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3459 }
3460 
3461 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3462   // For every instruction `I` in MinBWs, truncate the operands, create a
3463   // truncated version of `I` and reextend its result. InstCombine runs
3464   // later and will remove any ext/trunc pairs.
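  // For example, if an i32 add is known to require only 8 bits, its operands
  // are truncated to <VF x i8>, the add is performed on i8, and the result is
  // zero-extended back to <VF x i32>.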
3465   SmallPtrSet<Value *, 4> Erased;
3466   for (const auto &KV : Cost->getMinimalBitwidths()) {
3467     // If the value wasn't vectorized, we must maintain the original scalar
3468     // type. The absence of the value from State indicates that it
3469     // wasn't vectorized.
3470     // FIXME: Should not rely on getVPValue at this point.
3471     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3472     if (!State.hasAnyVectorValue(Def))
3473       continue;
3474     for (unsigned Part = 0; Part < UF; ++Part) {
3475       Value *I = State.get(Def, Part);
3476       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3477         continue;
3478       Type *OriginalTy = I->getType();
3479       Type *ScalarTruncatedTy =
3480           IntegerType::get(OriginalTy->getContext(), KV.second);
3481       auto *TruncatedTy = VectorType::get(
3482           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3483       if (TruncatedTy == OriginalTy)
3484         continue;
3485 
3486       IRBuilder<> B(cast<Instruction>(I));
3487       auto ShrinkOperand = [&](Value *V) -> Value * {
3488         if (auto *ZI = dyn_cast<ZExtInst>(V))
3489           if (ZI->getSrcTy() == TruncatedTy)
3490             return ZI->getOperand(0);
3491         return B.CreateZExtOrTrunc(V, TruncatedTy);
3492       };
3493 
3494       // The actual instruction modification depends on the instruction type,
3495       // unfortunately.
3496       Value *NewI = nullptr;
3497       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3498         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3499                              ShrinkOperand(BO->getOperand(1)));
3500 
3501         // Any wrapping introduced by shrinking this operation shouldn't be
3502         // considered undefined behavior. So, we can't unconditionally copy
3503         // arithmetic wrapping flags to NewI.
3504         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3505       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3506         NewI =
3507             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3508                          ShrinkOperand(CI->getOperand(1)));
3509       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3510         NewI = B.CreateSelect(SI->getCondition(),
3511                               ShrinkOperand(SI->getTrueValue()),
3512                               ShrinkOperand(SI->getFalseValue()));
3513       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3514         switch (CI->getOpcode()) {
3515         default:
3516           llvm_unreachable("Unhandled cast!");
3517         case Instruction::Trunc:
3518           NewI = ShrinkOperand(CI->getOperand(0));
3519           break;
3520         case Instruction::SExt:
3521           NewI = B.CreateSExtOrTrunc(
3522               CI->getOperand(0),
3523               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3524           break;
3525         case Instruction::ZExt:
3526           NewI = B.CreateZExtOrTrunc(
3527               CI->getOperand(0),
3528               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3529           break;
3530         }
3531       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3532         auto Elements0 =
3533             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3534         auto *O0 = B.CreateZExtOrTrunc(
3535             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3536         auto Elements1 =
3537             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3538         auto *O1 = B.CreateZExtOrTrunc(
3539             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3540 
3541         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3542       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3543         // Don't do anything with the operands, just extend the result.
3544         continue;
3545       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3546         auto Elements =
3547             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3548         auto *O0 = B.CreateZExtOrTrunc(
3549             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3550         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3551         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3552       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3553         auto Elements =
3554             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3555         auto *O0 = B.CreateZExtOrTrunc(
3556             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3557         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3558       } else {
3559         // If we don't know what to do, be conservative and don't do anything.
3560         continue;
3561       }
3562 
3563       // Lastly, extend the result.
3564       NewI->takeName(cast<Instruction>(I));
3565       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3566       I->replaceAllUsesWith(Res);
3567       cast<Instruction>(I)->eraseFromParent();
3568       Erased.insert(I);
3569       State.reset(Def, Res, Part);
3570     }
3571   }
3572 
3573   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3574   for (const auto &KV : Cost->getMinimalBitwidths()) {
3575     // If the value wasn't vectorized, we must maintain the original scalar
3576     // type. The absence of the value from State indicates that it
3577     // wasn't vectorized.
3578     // FIXME: Should not rely on getVPValue at this point.
3579     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3580     if (!State.hasAnyVectorValue(Def))
3581       continue;
3582     for (unsigned Part = 0; Part < UF; ++Part) {
3583       Value *I = State.get(Def, Part);
3584       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3585       if (Inst && Inst->use_empty()) {
3586         Value *NewI = Inst->getOperand(0);
3587         Inst->eraseFromParent();
3588         State.reset(Def, NewI, Part);
3589       }
3590     }
3591   }
3592 }
3593 
3594 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3595                                             VPlan &Plan) {
3596   // Insert truncates and extends for any truncated instructions as hints to
3597   // InstCombine.
3598   if (VF.isVector())
3599     truncateToMinimalBitwidths(State);
3600 
3601   // Fix widened non-induction PHIs by setting up the PHI operands.
3602   if (EnableVPlanNativePath)
3603     fixNonInductionPHIs(Plan, State);
3604 
3605   // At this point every instruction in the original loop is widened to a
3606   // vector form. Now we need to fix the recurrences in the loop. These PHI
3607   // nodes are currently empty because we did not want to introduce cycles.
3608   // This is the second stage of vectorizing recurrences.
3609   fixCrossIterationPHIs(State);
3610 
3611   // Forget the original basic block.
3612   PSE.getSE()->forgetLoop(OrigLoop);
3613 
3614   VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3615   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3616   if (Cost->requiresScalarEpilogue(VF)) {
    // No edge from the middle block to the unique exit block has been
    // inserted and there is nothing to fix from the vector loop; phis should
    // have incoming values from the scalar loop only.
3620     Plan.clearLiveOuts();
3621   } else {
3622     // If we inserted an edge from the middle block to the unique exit block,
3623     // update uses outside the loop (phis) to account for the newly inserted
3624     // edge.
3625 
3626     // Fix-up external users of the induction variables.
3627     for (auto &Entry : Legal->getInductionVars())
3628       fixupIVUsers(Entry.first, Entry.second,
3629                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3630                    IVEndValues[Entry.first], LoopMiddleBlock,
3631                    VectorLoop->getHeader(), Plan);
3632   }
3633 
3634   // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3635   // in the exit block, so update the builder.
3636   State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3637   for (auto &KV : Plan.getLiveOuts())
3638     KV.second->fixPhi(Plan, State);
3639 
3640   for (Instruction *PI : PredicatedInstructions)
3641     sinkScalarOperands(&*PI);
3642 
3643   // Remove redundant induction instructions.
3644   cse(VectorLoop->getHeader());
3645 
  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting a slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
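  //
  // For example (illustrative): if the original profile suggests ~100
  // iterations and VF * UF == 8, roughly 100/8 iterations are attributed to
  // the vector loop and the small remainder to the scalar loop; the exact
  // weights are computed by setProfileInfoAfterUnrolling().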
3659   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3660                                LI->getLoopFor(LoopScalarBody),
3661                                VF.getKnownMinValue() * UF);
3662 }
3663 
3664 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3665   // In order to support recurrences we need to be able to vectorize Phi nodes.
3666   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3667   // stage #2: We now need to fix the recurrences by adding incoming edges to
3668   // the currently empty PHI nodes. At this point every instruction in the
3669   // original loop is widened to a vector form so we can use them to construct
3670   // the incoming edges.
3671   VPBasicBlock *Header =
3672       State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3673   for (VPRecipeBase &R : Header->phis()) {
3674     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3675       fixReduction(ReductionPhi, State);
3676     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3677       fixFirstOrderRecurrence(FOR, State);
3678   }
3679 }
3680 
3681 void InnerLoopVectorizer::fixFirstOrderRecurrence(
3682     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3683   // This is the second phase of vectorizing first-order recurrences. An
3684   // overview of the transformation is described below. Suppose we have the
3685   // following loop.
3686   //
3687   //   for (int i = 0; i < n; ++i)
3688   //     b[i] = a[i] - a[i - 1];
3689   //
3690   // There is a first-order recurrence on "a". For this loop, the shorthand
3691   // scalar IR looks like:
3692   //
3693   //   scalar.ph:
3694   //     s_init = a[-1]
3695   //     br scalar.body
3696   //
3697   //   scalar.body:
3698   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3699   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3700   //     s2 = a[i]
3701   //     b[i] = s2 - s1
3702   //     br cond, scalar.body, ...
3703   //
  // In this example, s1 is a recurrence because its value depends on the
3705   // previous iteration. In the first phase of vectorization, we created a
3706   // vector phi v1 for s1. We now complete the vectorization and produce the
3707   // shorthand vector IR shown below (for VF = 4, UF = 1).
3708   //
3709   //   vector.ph:
3710   //     v_init = vector(..., ..., ..., a[-1])
3711   //     br vector.body
3712   //
3713   //   vector.body
3714   //     i = phi [0, vector.ph], [i+4, vector.body]
3715   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3716   //     v2 = a[i, i+1, i+2, i+3];
3717   //     v3 = vector(v1(3), v2(0, 1, 2))
3718   //     b[i, i+1, i+2, i+3] = v2 - v3
3719   //     br cond, vector.body, middle.block
3720   //
3721   //   middle.block:
3722   //     x = v2(3)
3723   //     br scalar.ph
3724   //
3725   //   scalar.ph:
3726   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3727   //     br scalar.body
3728   //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3731 
3732   // Extract the last vector element in the middle block. This will be the
3733   // initial value for the recurrence when jumping to the scalar loop.
3734   VPValue *PreviousDef = PhiR->getBackedgeValue();
3735   Value *Incoming = State.get(PreviousDef, UF - 1);
3736   auto *ExtractForScalar = Incoming;
3737   auto *IdxTy = Builder.getInt32Ty();
3738   if (VF.isVector()) {
3739     auto *One = ConstantInt::get(IdxTy, 1);
3740     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3741     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3742     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3743     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3744                                                     "vector.recur.extract");
3745   }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need to extract the phi itself
  // and not the last element (the phi update in the current iteration). This
  // will be the value when jumping to the exit block from the LoopMiddleBlock,
  // when the scalar loop is not run at all.
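  // For the VF = 4 example above, this extracts v2(2), i.e. the value the
  // scalar phi s1 held in the last iteration executed by the vector loop,
  // which is what LCSSA phi users of s1 in the exit block need when the
  // scalar loop does not run (illustrative).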
3751   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3752   if (VF.isVector()) {
3753     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3754     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3755     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3756         Incoming, Idx, "vector.recur.extract.for.phi");
3757   } else if (UF > 1)
    // When the loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the part just prior to the last
    // unrolled part of `Incoming`. This is analogous to the vectorized case
    // above: extracting the second-to-last element when VF > 1.
3762     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3763 
3764   // Fix the initial value of the original recurrence in the scalar loop.
3765   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3766   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3767   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3768   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3769   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3770     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3771     Start->addIncoming(Incoming, BB);
3772   }
3773 
3774   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3775   Phi->setName("scalar.recur");
3776 
3777   // Finally, fix users of the recurrence outside the loop. The users will need
3778   // either the last value of the scalar recurrence or the last value of the
3779   // vector recurrence we extracted in the middle block. Since the loop is in
3780   // LCSSA form, we just need to find all the phi nodes for the original scalar
3781   // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from middle to exit and
  // thus no phis which need to be updated.
3786   if (!Cost->requiresScalarEpilogue(VF))
3787     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3788       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3789         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3790         State.Plan->removeLiveOut(&LCSSAPhi);
3791       }
3792 }
3793 
3794 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3795                                        VPTransformState &State) {
3796   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
3798   assert(Legal->isReductionVariable(OrigPhi) &&
3799          "Unable to find the reduction variable");
3800   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3801 
3802   RecurKind RK = RdxDesc.getRecurrenceKind();
3803   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3804   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3805   setDebugLocFromInst(ReductionStartValue);
3806 
3807   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3808   // This is the vector-clone of the value that leaves the loop.
3809   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3810 
3811   // Wrap flags are in general invalid after vectorization, clear them.
3812   clearReductionWrapFlags(PhiR, State);
3813 
3814   // Before each round, move the insertion point right between
3815   // the PHIs and the values we are going to write.
3816   // This allows us to write both PHINodes and the extractelement
3817   // instructions.
3818   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3819 
3820   setDebugLocFromInst(LoopExitInst);
3821 
3822   Type *PhiTy = OrigPhi->getType();
3823 
3824   VPBasicBlock *LatchVPBB =
3825       PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3826   BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
  // If the tail is folded by masking, the vector value leaving the loop should
  // be a Select choosing between the vectorized LoopExitInst and the
  // vectorized Phi, instead of the LoopExitInst alone. For an in-loop
  // reduction the reduction will already be predicated, and does not need to
  // be handled here.
3831   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3832     for (unsigned Part = 0; Part < UF; ++Part) {
3833       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3834       SelectInst *Sel = nullptr;
3835       for (User *U : VecLoopExitInst->users()) {
3836         if (isa<SelectInst>(U)) {
3837           assert(!Sel && "Reduction exit feeding two selects");
3838           Sel = cast<SelectInst>(U);
3839         } else
3840           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3841       }
3842       assert(Sel && "Reduction exit feeds no select");
3843       State.reset(LoopExitInstDef, Sel, Part);
3844 
3845       if (isa<FPMathOperator>(Sel))
3846         Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
3847 
3848       // If the target can create a predicated operator for the reduction at no
3849       // extra cost in the loop (for example a predicated vadd), it can be
3850       // cheaper for the select to remain in the loop than be sunk out of it,
3851       // and so use the select value for the phi instead of the old
3852       // LoopExitValue.
3853       if (PreferPredicatedReductionSelect ||
3854           TTI->preferPredicatedReductionSelect(
3855               RdxDesc.getOpcode(), PhiTy,
3856               TargetTransformInfo::ReductionFlags())) {
3857         auto *VecRdxPhi =
3858             cast<PHINode>(State.get(PhiR, Part));
3859         VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3860       }
3861     }
3862   }
3863 
3864   // If the vector reduction can be performed in a smaller type, we truncate
3865   // then extend the loop exit value to enable InstCombine to evaluate the
3866   // entire expression in the smaller type.
3867   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3868     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3869     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3870     Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3871     VectorParts RdxParts(UF);
3872     for (unsigned Part = 0; Part < UF; ++Part) {
3873       RdxParts[Part] = State.get(LoopExitInstDef, Part);
3874       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3875       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3876                                         : Builder.CreateZExt(Trunc, VecTy);
3877       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3878         if (U != Trunc) {
3879           U->replaceUsesOfWith(RdxParts[Part], Extnd);
3880           RdxParts[Part] = Extnd;
3881         }
3882     }
3883     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3884     for (unsigned Part = 0; Part < UF; ++Part) {
3885       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3886       State.reset(LoopExitInstDef, RdxParts[Part], Part);
3887     }
3888   }
3889 
3890   // Reduce all of the unrolled parts into a single vector.
3891   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3892   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
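  // For example (illustrative), with UF = 2 an integer add reduction combines
  // its two unrolled parts with a single vector add, e.g.
  //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
  // before the combined vector is reduced to a scalar below.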
3893 
3894   // The middle block terminator has already been assigned a DebugLoc here (the
3895   // OrigLoop's single latch terminator). We want the whole middle block to
3896   // appear to execute on this line because: (a) it is all compiler generated,
3897   // (b) these instructions are always executed after evaluating the latch
3898   // conditional branch, and (c) other passes may add new predecessors which
3899   // terminate on this line. This is the easiest way to ensure we don't
3900   // accidentally cause an extra step back into the loop while debugging.
3901   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3902   if (PhiR->isOrdered())
3903     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3904   else {
3905     // Floating-point operations should have some FMF to enable the reduction.
3906     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3907     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3908     for (unsigned Part = 1; Part < UF; ++Part) {
3909       Value *RdxPart = State.get(LoopExitInstDef, Part);
3910       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3911         ReducedPartRdx = Builder.CreateBinOp(
3912             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3913       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3914         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3915                                            ReducedPartRdx, RdxPart);
3916       else
3917         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3918     }
3919   }
3920 
3921   // Create the reduction after the loop. Note that inloop reductions create the
3922   // target reduction in the loop using a Reduction recipe.
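  // For example (illustrative), an integer add reduction with VF = 4 is
  // reduced to a scalar via the llvm.vector.reduce.add.v4i32 intrinsic.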
3923   if (VF.isVector() && !PhiR->isInLoop()) {
3924     ReducedPartRdx =
3925         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3926     // If the reduction can be performed in a smaller type, we need to extend
3927     // the reduction to the wider type before we branch to the original loop.
3928     if (PhiTy != RdxDesc.getRecurrenceType())
3929       ReducedPartRdx = RdxDesc.isSigned()
3930                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3931                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3932   }
3933 
3934   PHINode *ResumePhi =
3935       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
3936 
3937   // Create a phi node that merges control-flow from the backedge-taken check
3938   // block and the middle block.
3939   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
3940                                         LoopScalarPreHeader->getTerminator());
3941 
3942   // If we are fixing reductions in the epilogue loop then we should already
3943   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
3944   // we carry over the incoming values correctly.
3945   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
3946     if (Incoming == LoopMiddleBlock)
3947       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
3948     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
3949       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
3950                               Incoming);
3951     else
3952       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
3953   }
3954 
3955   // Set the resume value for this reduction
3956   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
3957 
3958   // If there were stores of the reduction value to a uniform memory address
3959   // inside the loop, create the final store here.
3960   if (StoreInst *SI = RdxDesc.IntermediateStore) {
3961     StoreInst *NewSI =
3962         Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
3963     propagateMetadata(NewSI, SI);
3964 
3965     // If the reduction value is used in other places,
3966     // then let the code below create PHI's for that.
3967   }
3968 
3969   // Now, we need to fix the users of the reduction variable
3970   // inside and outside of the scalar remainder loop.
3971 
3972   // We know that the loop is in LCSSA form. We need to update the PHI nodes
3973   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
3975   if (!Cost->requiresScalarEpilogue(VF))
3976     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3977       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
3978         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3979         State.Plan->removeLiveOut(&LCSSAPhi);
3980       }
3981 
3982   // Fix the scalar loop reduction variable with the incoming reduction sum
3983   // from the vector body and from the backedge value.
3984   int IncomingEdgeBlockIdx =
3985       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3986   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3987   // Pick the other block.
3988   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3989   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3990   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3991 }
3992 
3993 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
3994                                                   VPTransformState &State) {
3995   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3996   RecurKind RK = RdxDesc.getRecurrenceKind();
3997   if (RK != RecurKind::Add && RK != RecurKind::Mul)
3998     return;
3999 
4000   SmallVector<VPValue *, 8> Worklist;
4001   SmallPtrSet<VPValue *, 8> Visited;
4002   Worklist.push_back(PhiR);
4003   Visited.insert(PhiR);
4004 
  while (!Worklist.empty()) {
    VPValue *Cur = Worklist.pop_back_val();
    for (unsigned Part = 0; Part < UF; ++Part) {
      Value *V = State.get(Cur, Part);
      if (!isa<OverflowingBinaryOperator>(V))
        break;
      cast<Instruction>(V)->dropPoisonGeneratingFlags();
    }

    for (VPUser *U : Cur->users()) {
      auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
      if (!UserRecipe)
        continue;
      for (VPValue *V : UserRecipe->definedValues())
        if (Visited.insert(V).second)
          Worklist.push_back(V);
    }
  }
4023 }
4024 
4025 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4026   // The basic block and loop containing the predicated instruction.
4027   auto *PredBB = PredInst->getParent();
4028   auto *VectorLoop = LI->getLoopFor(PredBB);
4029 
4030   // Initialize a worklist with the operands of the predicated instruction.
4031   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4032 
4033   // Holds instructions that we need to analyze again. An instruction may be
4034   // reanalyzed if we don't yet know if we can sink it or not.
4035   SmallVector<Instruction *, 8> InstsToReanalyze;
4036 
4037   // Returns true if a given use occurs in the predicated block. Phi nodes use
4038   // their operands in their corresponding predecessor blocks.
4039   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4040     auto *I = cast<Instruction>(U.getUser());
4041     BasicBlock *BB = I->getParent();
4042     if (auto *Phi = dyn_cast<PHINode>(I))
4043       BB = Phi->getIncomingBlock(
4044           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4045     return BB == PredBB;
4046   };
4047 
4048   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends once a pass
  // through the worklist fails to sink a single instruction.
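  //
  // For example (illustrative): if the predicated block contains a scalarized
  // store of %add, and %add = add i32 %a, %b is defined outside the block
  // with no other users, %add is moved into the block; its operands %a and %b
  // are then reconsidered for sinking on the next pass.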
4052   bool Changed;
4053   do {
4054     // Add the instructions that need to be reanalyzed to the worklist, and
4055     // reset the changed indicator.
4056     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4057     InstsToReanalyze.clear();
4058     Changed = false;
4059 
4060     while (!Worklist.empty()) {
4061       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4062 
4063       // We can't sink an instruction if it is a phi node, is not in the loop,
4064       // or may have side effects.
4065       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4066           I->mayHaveSideEffects())
4067         continue;
4068 
4069       // If the instruction is already in PredBB, check if we can sink its
4070       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4071       // sinking the scalar instruction I, hence it appears in PredBB; but it
4072       // may have failed to sink I's operands (recursively), which we try
4073       // (again) here.
4074       if (I->getParent() == PredBB) {
4075         Worklist.insert(I->op_begin(), I->op_end());
4076         continue;
4077       }
4078 
4079       // It's legal to sink the instruction if all its uses occur in the
4080       // predicated block. Otherwise, there's nothing to do yet, and we may
4081       // need to reanalyze the instruction.
4082       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4083         InstsToReanalyze.push_back(I);
4084         continue;
4085       }
4086 
      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4089       I->moveBefore(&*PredBB->getFirstInsertionPt());
4090       Worklist.insert(I->op_begin(), I->op_end());
4091 
4092       // The sinking may have enabled other instructions to be sunk, so we will
4093       // need to iterate.
4094       Changed = true;
4095     }
4096   } while (Changed);
4097 }
4098 
4099 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4100                                               VPTransformState &State) {
4101   auto Iter = depth_first(
4102       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
4103   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4104     for (VPRecipeBase &P : VPBB->phis()) {
4105       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4106       if (!VPPhi)
4107         continue;
4108       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4109       // Make sure the builder has a valid insert point.
4110       Builder.SetInsertPoint(NewPhi);
4111       for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4112         VPValue *Inc = VPPhi->getIncomingValue(i);
4113         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4114         NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4115       }
4116     }
4117   }
4118 }
4119 
4120 bool InnerLoopVectorizer::useOrderedReductions(
4121     const RecurrenceDescriptor &RdxDesc) {
4122   return Cost->useOrderedReductions(RdxDesc);
4123 }
4124 
4125 /// A helper function for checking whether an integer division-related
4126 /// instruction may divide by zero (in which case it must be predicated if
4127 /// executed conditionally in the scalar code).
4128 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are not compile-time constants will not be
/// converted into multiplication, so we will still end up scalarizing
/// the division, but can do so w/o predication.
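/// For example (illustrative): "udiv i32 %x, 7" has a known non-zero constant
/// divisor and needs no predication, whereas "udiv i32 %x, %d" (or a constant
/// zero divisor) conservatively returns true here.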
4132 static bool mayDivideByZero(Instruction &I) {
4133   assert((I.getOpcode() == Instruction::UDiv ||
4134           I.getOpcode() == Instruction::SDiv ||
4135           I.getOpcode() == Instruction::URem ||
4136           I.getOpcode() == Instruction::SRem) &&
4137          "Unexpected instruction");
4138   Value *Divisor = I.getOperand(1);
4139   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4140   return !CInt || CInt->isZero();
4141 }
4142 
4143 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4144                                                VPUser &ArgOperands,
4145                                                VPTransformState &State) {
4146   assert(!isa<DbgInfoIntrinsic>(I) &&
4147          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4148   setDebugLocFromInst(&I);
4149 
4150   Module *M = I.getParent()->getParent()->getParent();
4151   auto *CI = cast<CallInst>(&I);
4152 
4153   SmallVector<Type *, 4> Tys;
4154   for (Value *ArgOperand : CI->args())
4155     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4156 
4157   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4158 
  // The UseVectorIntrinsic flag below shows whether we use an intrinsic or a
  // plain call for the vectorized version of the instruction, i.e. whether it
  // is beneficial to perform an intrinsic call compared to a library call.
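  // For example (illustrative), a call that maps to llvm.sqrt with VF = 4 may
  // be widened either to the llvm.sqrt.v4f32 intrinsic or to a vector library
  // function known to TLI; whichever the cost model deems cheaper is used.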
4162   bool NeedToScalarize = false;
4163   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4164   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4165   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4166   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4167          "Instruction should be scalarized elsewhere.");
4168   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4169          "Either the intrinsic cost or vector call cost must be valid");
4170 
4171   for (unsigned Part = 0; Part < UF; ++Part) {
4172     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4173     SmallVector<Value *, 4> Args;
4174     for (auto &I : enumerate(ArgOperands.operands())) {
4175       // Some intrinsics have a scalar argument - don't replace it with a
4176       // vector.
4177       Value *Arg;
4178       if (!UseVectorIntrinsic ||
4179           !isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))
4180         Arg = State.get(I.value(), Part);
4181       else
4182         Arg = State.get(I.value(), VPIteration(0, 0));
4183       if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))
4184         TysForDecl.push_back(Arg->getType());
4185       Args.push_back(Arg);
4186     }
4187 
4188     Function *VectorF;
4189     if (UseVectorIntrinsic) {
4190       // Use vector version of the intrinsic.
4191       if (VF.isVector())
4192         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4193       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4194       assert(VectorF && "Can't retrieve vector intrinsic.");
4195     } else {
4196       // Use vector version of the function call.
4197       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4198 #ifndef NDEBUG
4199       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4200              "Can't create vector function.");
4201 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    State.addMetadata(V, &I);
4213   }
4214 }
4215 
4216 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4217   // We should not collect Scalars more than once per VF. Right now, this
4218   // function is called from collectUniformsAndScalars(), which already does
4219   // this check. Collecting Scalars for VF=1 does not make any sense.
4220   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4221          "This function should not be visited twice for the same VF");
4222 
4223   // This avoids any chances of creating a REPLICATE recipe during planning
4224   // since that would result in generation of scalarized code during execution,
4225   // which is not supported for scalable vectors.
4226   if (VF.isScalable()) {
4227     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4228     return;
4229   }
4230 
4231   SmallSetVector<Instruction *, 8> Worklist;
4232 
4233   // These sets are used to seed the analysis with pointers used by memory
4234   // accesses that will remain scalar.
4235   SmallSetVector<Instruction *, 8> ScalarPtrs;
4236   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4237   auto *Latch = TheLoop->getLoopLatch();
4238 
4239   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4240   // The pointer operands of loads and stores will be scalar as long as the
4241   // memory access is not a gather or scatter operation. The value operand of a
4242   // store will remain scalar if the store is scalarized.
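  // For example (illustrative), the pointer of a load widened to a
  // consecutive vector load is a scalar use, whereas the pointer of a
  // gather/scatter is not.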
4243   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4244     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4245     assert(WideningDecision != CM_Unknown &&
4246            "Widening decision should be ready at this moment");
4247     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4248       if (Ptr == Store->getValueOperand())
4249         return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4252     return WideningDecision != CM_GatherScatter;
4253   };
4254 
4255   // A helper that returns true if the given value is a bitcast or
4256   // getelementptr instruction contained in the loop.
4257   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4258     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4259             isa<GetElementPtrInst>(V)) &&
4260            !TheLoop->isLoopInvariant(V);
4261   };
4262 
4263   // A helper that evaluates a memory access's use of a pointer. If the use will
4264   // be a scalar use and the pointer is only used by memory accesses, we place
4265   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4266   // PossibleNonScalarPtrs.
4267   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4268     // We only care about bitcast and getelementptr instructions contained in
4269     // the loop.
4270     if (!isLoopVaryingBitCastOrGEP(Ptr))
4271       return;
4272 
4273     // If the pointer has already been identified as scalar (e.g., if it was
4274     // also identified as uniform), there's nothing to do.
4275     auto *I = cast<Instruction>(Ptr);
4276     if (Worklist.count(I))
4277       return;
4278 
4279     // If the use of the pointer will be a scalar use, and all users of the
4280     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4281     // place the pointer in PossibleNonScalarPtrs.
4282     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4283           return isa<LoadInst>(U) || isa<StoreInst>(U);
4284         }))
4285       ScalarPtrs.insert(I);
4286     else
4287       PossibleNonScalarPtrs.insert(I);
4288   };
4289 
  // We seed the scalars analysis with two classes of instructions: (1)
  // instructions marked uniform-after-vectorization and (2) bitcast,
  // getelementptr and (pointer) phi instructions used by memory accesses
  // requiring a scalar use.
4294   //
4295   // (1) Add to the worklist all instructions that have been identified as
4296   // uniform-after-vectorization.
4297   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4298 
4299   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4300   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4302   // scatter operation. The value operand of a store will remain scalar if the
4303   // store is scalarized.
4304   for (auto *BB : TheLoop->blocks())
4305     for (auto &I : *BB) {
4306       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4307         evaluatePtrUse(Load, Load->getPointerOperand());
4308       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4309         evaluatePtrUse(Store, Store->getPointerOperand());
4310         evaluatePtrUse(Store, Store->getValueOperand());
4311       }
4312     }
4313   for (auto *I : ScalarPtrs)
4314     if (!PossibleNonScalarPtrs.count(I)) {
4315       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4316       Worklist.insert(I);
4317     }
4318 
4319   // Insert the forced scalars.
4320   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4321   // induction variable when the PHI user is scalarized.
4322   auto ForcedScalar = ForcedScalars.find(VF);
4323   if (ForcedScalar != ForcedScalars.end())
4324     for (auto *I : ForcedScalar->second)
4325       Worklist.insert(I);
4326 
4327   // Expand the worklist by looking through any bitcasts and getelementptr
4328   // instructions we've already identified as scalar. This is similar to the
4329   // expansion step in collectLoopUniforms(); however, here we're only
4330   // expanding to include additional bitcasts and getelementptr instructions.
4331   unsigned Idx = 0;
4332   while (Idx != Worklist.size()) {
4333     Instruction *Dst = Worklist[Idx++];
4334     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4335       continue;
4336     auto *Src = cast<Instruction>(Dst->getOperand(0));
4337     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4338           auto *J = cast<Instruction>(U);
4339           return !TheLoop->contains(J) || Worklist.count(J) ||
4340                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4341                   isScalarUse(J, Src));
4342         })) {
4343       Worklist.insert(Src);
4344       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4345     }
4346   }
4347 
4348   // An induction variable will remain scalar if all users of the induction
4349   // variable and induction variable update remain scalar.
4350   for (auto &Induction : Legal->getInductionVars()) {
4351     auto *Ind = Induction.first;
4352     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4353 
4354     // If tail-folding is applied, the primary induction variable will be used
4355     // to feed a vector compare.
4356     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4357       continue;
4358 
4359     // Returns true if \p Indvar is a pointer induction that is used directly by
4360     // load/store instruction \p I.
4361     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4362                                               Instruction *I) {
4363       return Induction.second.getKind() ==
4364                  InductionDescriptor::IK_PtrInduction &&
4365              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4366              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4367     };
4368 
4369     // Determine if all users of the induction variable are scalar after
4370     // vectorization.
4371     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4372       auto *I = cast<Instruction>(U);
4373       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4374              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4375     });
4376     if (!ScalarInd)
4377       continue;
4378 
4379     // Determine if all users of the induction variable update instruction are
4380     // scalar after vectorization.
4381     auto ScalarIndUpdate =
4382         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4383           auto *I = cast<Instruction>(U);
4384           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4385                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4386         });
4387     if (!ScalarIndUpdate)
4388       continue;
4389 
4390     // The induction variable and its update instruction will remain scalar.
4391     Worklist.insert(Ind);
4392     Worklist.insert(IndUpdate);
4393     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4394     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4395                       << "\n");
4396   }
4397 
4398   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4399 }
4400 
4401 bool LoopVectorizationCostModel::isScalarWithPredication(
4402     Instruction *I, ElementCount VF) const {
4403   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4404     return false;
  switch (I->getOpcode()) {
4406   default:
4407     break;
4408   case Instruction::Load:
4409   case Instruction::Store: {
4410     if (!Legal->isMaskRequired(I))
4411       return false;
4412     auto *Ptr = getLoadStorePointerOperand(I);
4413     auto *Ty = getLoadStoreType(I);
4414     Type *VTy = Ty;
4415     if (VF.isVector())
4416       VTy = VectorType::get(Ty, VF);
4417     const Align Alignment = getLoadStoreAlignment(I);
4418     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4419                                 TTI.isLegalMaskedGather(VTy, Alignment))
4420                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4421                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4422   }
4423   case Instruction::UDiv:
4424   case Instruction::SDiv:
4425   case Instruction::SRem:
4426   case Instruction::URem:
4427     return mayDivideByZero(*I);
4428   }
4429   return false;
4430 }
4431 
4432 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4433     Instruction *I, ElementCount VF) {
4434   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4435   assert(getWideningDecision(I, VF) == CM_Unknown &&
4436          "Decision should not be set yet.");
4437   auto *Group = getInterleavedAccessGroup(I);
4438   assert(Group && "Must have a group.");
4439 
  // If the instruction's allocated size doesn't equal its type size, it
4441   // requires padding and will be scalarized.
4442   auto &DL = I->getModule()->getDataLayout();
4443   auto *ScalarTy = getLoadStoreType(I);
4444   if (hasIrregularType(ScalarTy, DL))
4445     return false;
4446 
4447   // If the group involves a non-integral pointer, we may not be able to
4448   // losslessly cast all values to a common type.
4449   unsigned InterleaveFactor = Group->getFactor();
4450   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4451   for (unsigned i = 0; i < InterleaveFactor; i++) {
4452     Instruction *Member = Group->getMember(i);
4453     if (!Member)
4454       continue;
4455     auto *MemberTy = getLoadStoreType(Member);
4456     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4457     // Don't coerce non-integral pointers to integers or vice versa.
4458     if (MemberNI != ScalarNI) {
4459       // TODO: Consider adding special nullptr value case here
4460       return false;
4461     } else if (MemberNI && ScalarNI &&
4462                ScalarTy->getPointerAddressSpace() !=
4463                MemberTy->getPointerAddressSpace()) {
4464       return false;
4465     }
4466   }
4467 
4468   // Check if masking is required.
4469   // A Group may need masking for one of two reasons: it resides in a block that
4470   // needs predication, or it was decided to use masking to deal with gaps
4471   // (either a gap at the end of a load-access that may result in a speculative
4472   // load, or any gaps in a store-access).
4473   bool PredicatedAccessRequiresMasking =
4474       blockNeedsPredicationForAnyReason(I->getParent()) &&
4475       Legal->isMaskRequired(I);
4476   bool LoadAccessWithGapsRequiresEpilogMasking =
4477       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4478       !isScalarEpilogueAllowed();
4479   bool StoreAccessWithGapsRequiresMasking =
4480       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4481   if (!PredicatedAccessRequiresMasking &&
4482       !LoadAccessWithGapsRequiresEpilogMasking &&
4483       !StoreAccessWithGapsRequiresMasking)
4484     return true;
4485 
4486   // If masked interleaving is required, we expect that the user/target had
4487   // enabled it, because otherwise it either wouldn't have been created or
4488   // it should have been invalidated by the CostModel.
4489   assert(useMaskedInterleavedAccesses(TTI) &&
4490          "Masked interleave-groups for predicated accesses are not enabled.");
4491 
4492   if (Group->isReverse())
4493     return false;
4494 
4495   auto *Ty = getLoadStoreType(I);
4496   const Align Alignment = getLoadStoreAlignment(I);
4497   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4498                           : TTI.isLegalMaskedStore(Ty, Alignment);
4499 }
4500 
4501 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4502     Instruction *I, ElementCount VF) {
4503   // Get and ensure we have a valid memory instruction.
4504   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4505 
4506   auto *Ptr = getLoadStorePointerOperand(I);
4507   auto *ScalarTy = getLoadStoreType(I);
4508 
4509   // In order to be widened, the pointer should be consecutive, first of all.
4510   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4511     return false;
4512 
4513   // If the instruction is a store located in a predicated block, it will be
4514   // scalarized.
4515   if (isScalarWithPredication(I, VF))
4516     return false;
4517 
  // If the instruction's allocated size doesn't equal its type size, it
4519   // requires padding and will be scalarized.
4520   auto &DL = I->getModule()->getDataLayout();
4521   if (hasIrregularType(ScalarTy, DL))
4522     return false;
4523 
4524   return true;
4525 }
4526 
4527 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4528   // We should not collect Uniforms more than once per VF. Right now,
4529   // this function is called from collectUniformsAndScalars(), which
4530   // already does this check. Collecting Uniforms for VF=1 does not make any
4531   // sense.
4532 
4533   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4534          "This function should not be visited twice for the same VF");
4535 
  // Visit the list of Uniforms. If we do not find any uniform value, we will
  // not analyze it again. Uniforms.count(VF) will return 1.
4538   Uniforms[VF].clear();
4539 
4540   // We now know that the loop is vectorizable!
4541   // Collect instructions inside the loop that will remain uniform after
4542   // vectorization.
4543 
4544   // Global values, params and instructions outside of current loop are out of
4545   // scope.
4546   auto isOutOfScope = [&](Value *V) -> bool {
4547     Instruction *I = dyn_cast<Instruction>(V);
4548     return (!I || !TheLoop->contains(I));
4549   };
4550 
4551   // Worklist containing uniform instructions demanding lane 0.
4552   SetVector<Instruction *> Worklist;
4553   BasicBlock *Latch = TheLoop->getLoopLatch();
4554 
4555   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4556   // that are scalar with predication must not be considered uniform after
4557   // vectorization, because that would create an erroneous replicating region
4558   // where only a single instance out of VF should be formed.
4559   // TODO: optimize such seldom cases if found important, see PR40816.
4560   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4561     if (isOutOfScope(I)) {
4562       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4563                         << *I << "\n");
4564       return;
4565     }
4566     if (isScalarWithPredication(I, VF)) {
4567       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4568                         << *I << "\n");
4569       return;
4570     }
4571     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4572     Worklist.insert(I);
4573   };
4574 
4575   // Start with the conditional branch. If the branch condition is an
4576   // instruction contained in the loop that is only used by the branch, it is
4577   // uniform.
4578   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4579   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4580     addToWorklistIfAllowed(Cmp);
4581 
4582   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4583     InstWidening WideningDecision = getWideningDecision(I, VF);
4584     assert(WideningDecision != CM_Unknown &&
4585            "Widening decision should be ready at this moment");
4586 
4587     // A uniform memory op is itself uniform.  We exclude uniform stores
4588     // here as they demand the last lane, not the first one.
4589     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
4590       assert(WideningDecision == CM_Scalarize);
4591       return true;
4592     }
4593 
4594     return (WideningDecision == CM_Widen ||
4595             WideningDecision == CM_Widen_Reverse ||
4596             WideningDecision == CM_Interleave);
4597   };
4598 
4600   // Returns true if Ptr is the pointer operand of a memory access instruction
4601   // I, and I is known to not require scalarization.
4602   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4603     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4604   };
4605 
4606   // Holds a list of values which are known to have at least one uniform use.
4607   // Note that there may be other uses which aren't uniform.  A "uniform use"
4608   // here is something which only demands lane 0 of the unrolled iterations;
4609   // it does not imply that all lanes produce the same value (e.g. this is not
4610   // the usual meaning of uniform)
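  // For example (illustrative), the address feeding a consecutive widened
  // load only needs its lane-0 value to form the wide load, even though each
  // lane would compute a different address.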
4611   SetVector<Value *> HasUniformUse;
4612 
4613   // Scan the loop for instructions which are either a) known to have only
4614   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4615   for (auto *BB : TheLoop->blocks())
4616     for (auto &I : *BB) {
4617       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4618         switch (II->getIntrinsicID()) {
4619         case Intrinsic::sideeffect:
4620         case Intrinsic::experimental_noalias_scope_decl:
4621         case Intrinsic::assume:
4622         case Intrinsic::lifetime_start:
4623         case Intrinsic::lifetime_end:
4624           if (TheLoop->hasLoopInvariantOperands(&I))
4625             addToWorklistIfAllowed(&I);
4626           break;
4627         default:
4628           break;
4629         }
4630       }
4631 
4632       // ExtractValue instructions must be uniform, because the operands are
4633       // known to be loop-invariant.
4634       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4635         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4636                "Expected aggregate value to be loop invariant");
4637         addToWorklistIfAllowed(EVI);
4638         continue;
4639       }
4640 
4641       // If there's no pointer operand, there's nothing to do.
4642       auto *Ptr = getLoadStorePointerOperand(&I);
4643       if (!Ptr)
4644         continue;
4645 
4646       // A uniform memory op is itself uniform.  We exclude uniform stores
4647       // here as they demand the last lane, not the first one.
4648       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
4649         addToWorklistIfAllowed(&I);
4650 
4651       if (isUniformDecision(&I, VF)) {
4652         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4653         HasUniformUse.insert(Ptr);
4654       }
4655     }
4656 
4657   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4658   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4659   // disallows uses outside the loop as well.
4660   for (auto *V : HasUniformUse) {
4661     if (isOutOfScope(V))
4662       continue;
4663     auto *I = cast<Instruction>(V);
4664     auto UsersAreMemAccesses =
4665       llvm::all_of(I->users(), [&](User *U) -> bool {
4666         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4667       });
4668     if (UsersAreMemAccesses)
4669       addToWorklistIfAllowed(I);
4670   }
4671 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // a uniform instruction will only be used by uniform instructions.
4675   unsigned idx = 0;
4676   while (idx != Worklist.size()) {
4677     Instruction *I = Worklist[idx++];
4678 
4679     for (auto OV : I->operand_values()) {
4680       // isOutOfScope operands cannot be uniform instructions.
4681       if (isOutOfScope(OV))
4682         continue;
4683       // First order recurrence Phi's should typically be considered
4684       // non-uniform.
4685       auto *OP = dyn_cast<PHINode>(OV);
4686       if (OP && Legal->isFirstOrderRecurrence(OP))
4687         continue;
4688       // If all the users of the operand are uniform, then add the
4689       // operand into the uniform worklist.
4690       auto *OI = cast<Instruction>(OV);
4691       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4692             auto *J = cast<Instruction>(U);
4693             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4694           }))
4695         addToWorklistIfAllowed(OI);
4696     }
4697   }
4698 
4699   // For an instruction to be added into Worklist above, all its users inside
4700   // the loop should also be in Worklist. However, this condition cannot be
4701   // true for phi nodes that form a cyclic dependence. We must process phi
4702   // nodes separately. An induction variable will remain uniform if all users
4703   // of the induction variable and induction variable update remain uniform.
4704   // The code below handles both pointer and non-pointer induction variables.
4705   for (auto &Induction : Legal->getInductionVars()) {
4706     auto *Ind = Induction.first;
4707     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4708 
4709     // Determine if all users of the induction variable are uniform after
4710     // vectorization.
4711     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4712       auto *I = cast<Instruction>(U);
4713       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4714              isVectorizedMemAccessUse(I, Ind);
4715     });
4716     if (!UniformInd)
4717       continue;
4718 
4719     // Determine if all users of the induction variable update instruction are
4720     // uniform after vectorization.
4721     auto UniformIndUpdate =
4722         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4723           auto *I = cast<Instruction>(U);
4724           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4725                  isVectorizedMemAccessUse(I, IndUpdate);
4726         });
4727     if (!UniformIndUpdate)
4728       continue;
4729 
4730     // The induction variable and its update instruction will remain uniform.
4731     addToWorklistIfAllowed(Ind);
4732     addToWorklistIfAllowed(IndUpdate);
4733   }
4734 
4735   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4736 }
4737 
4738 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4739   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4740 
4741   if (Legal->getRuntimePointerChecking()->Need) {
4742     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4743         "runtime pointer checks needed. Enable vectorization of this "
4744         "loop with '#pragma clang loop vectorize(enable)' when "
4745         "compiling with -Os/-Oz",
4746         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4747     return true;
4748   }
4749 
4750   if (!PSE.getPredicate().isAlwaysTrue()) {
4751     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4752         "runtime SCEV checks needed. Enable vectorization of this "
4753         "loop with '#pragma clang loop vectorize(enable)' when "
4754         "compiling with -Os/-Oz",
4755         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4756     return true;
4757   }
4758 
4759   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4760   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4761     reportVectorizationFailure("Runtime stride check for small trip count",
4762         "runtime stride == 1 checks needed. Enable vectorization of "
4763         "this loop without such check by compiling with -Os/-Oz",
4764         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4765     return true;
4766   }
4767 
4768   return false;
4769 }
4770 
4771 ElementCount
4772 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4773   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4774     return ElementCount::getScalable(0);
4775 
4776   if (Hints->isScalableVectorizationDisabled()) {
4777     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4778                             "ScalableVectorizationDisabled", ORE, TheLoop);
4779     return ElementCount::getScalable(0);
4780   }
4781 
4782   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4783 
4784   auto MaxScalableVF = ElementCount::getScalable(
4785       std::numeric_limits<ElementCount::ScalarTy>::max());
4786 
4787   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4788   // FIXME: While for scalable vectors this is currently sufficient, this should
4789   // be replaced by a more detailed mechanism that filters out specific VFs,
4790   // instead of invalidating vectorization for a whole set of VFs based on the
4791   // MaxVF.
4792 
4793   // Disable scalable vectorization if the loop contains unsupported reductions.
4794   if (!canVectorizeReductions(MaxScalableVF)) {
4795     reportVectorizationInfo(
4796         "Scalable vectorization not supported for the reduction "
4797         "operations found in this loop.",
4798         "ScalableVFUnfeasible", ORE, TheLoop);
4799     return ElementCount::getScalable(0);
4800   }
4801 
4802   // Disable scalable vectorization if the loop contains any instructions
4803   // with element types not supported for scalable vectors.
4804   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4805         return !Ty->isVoidTy() &&
4806                !this->TTI.isElementTypeLegalForScalableVector(Ty);
4807       })) {
4808     reportVectorizationInfo("Scalable vectorization is not supported "
4809                             "for all element types found in this loop.",
4810                             "ScalableVFUnfeasible", ORE, TheLoop);
4811     return ElementCount::getScalable(0);
4812   }
4813 
4814   if (Legal->isSafeForAnyVectorWidth())
4815     return MaxScalableVF;
4816 
4817   // Limit MaxScalableVF by the maximum safe dependence distance.
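  // Illustrative example (values are hypothetical): with MaxSafeElements = 32
  // and a target MaxVScale of 16, the clamped result below is
  // ElementCount::getScalable(32 / 16), i.e. vscale x 2.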
4818   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
4819   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4820     MaxVScale =
4821         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4822   MaxScalableVF = ElementCount::getScalable(
4823       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
4824   if (!MaxScalableVF)
4825     reportVectorizationInfo(
4826         "Max legal vector width too small, scalable vectorization "
4827         "unfeasible.",
4828         "ScalableVFUnfeasible", ORE, TheLoop);
4829 
4830   return MaxScalableVF;
4831 }
4832 
4833 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4834     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4835   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4836   unsigned SmallestType, WidestType;
4837   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4838 
4839   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed as MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
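  // Illustrative example (values are hypothetical): if LAA reports a maximum
  // safe vector width of 512 bits and the widest loop type is 64 bits, then
  // MaxSafeElements = PowerOf2Floor(512 / 64) = 8.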
4843   unsigned MaxSafeElements =
4844       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4845 
4846   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4847   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4848 
4849   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4850                     << ".\n");
4851   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4852                     << ".\n");
4853 
4854   // First analyze the UserVF, fall back if the UserVF should be ignored.
4855   if (UserVF) {
4856     auto MaxSafeUserVF =
4857         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4858 
4859     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4860       // If `VF=vscale x N` is safe, then so is `VF=N`
4861       if (UserVF.isScalable())
4862         return FixedScalableVFPair(
4863             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4864       else
4865         return UserVF;
4866     }
4867 
4868     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4869 
4870     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4871     // is better to ignore the hint and let the compiler choose a suitable VF.
4872     if (!UserVF.isScalable()) {
4873       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4874                         << " is unsafe, clamping to max safe VF="
4875                         << MaxSafeFixedVF << ".\n");
4876       ORE->emit([&]() {
4877         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4878                                           TheLoop->getStartLoc(),
4879                                           TheLoop->getHeader())
4880                << "User-specified vectorization factor "
4881                << ore::NV("UserVectorizationFactor", UserVF)
4882                << " is unsafe, clamping to maximum safe vectorization factor "
4883                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4884       });
4885       return MaxSafeFixedVF;
4886     }
4887 
4888     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4889       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4890                         << " is ignored because scalable vectors are not "
4891                            "available.\n");
4892       ORE->emit([&]() {
4893         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4894                                           TheLoop->getStartLoc(),
4895                                           TheLoop->getHeader())
4896                << "User-specified vectorization factor "
4897                << ore::NV("UserVectorizationFactor", UserVF)
4898                << " is ignored because the target does not support scalable "
4899                   "vectors. The compiler will pick a more suitable value.";
4900       });
4901     } else {
4902       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4903                         << " is unsafe. Ignoring scalable UserVF.\n");
4904       ORE->emit([&]() {
4905         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4906                                           TheLoop->getStartLoc(),
4907                                           TheLoop->getHeader())
4908                << "User-specified vectorization factor "
4909                << ore::NV("UserVectorizationFactor", UserVF)
4910                << " is unsafe. Ignoring the hint to let the compiler pick a "
4911                   "more suitable value.";
4912       });
4913     }
4914   }
4915 
4916   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4917                     << " / " << WidestType << " bits.\n");
4918 
4919   FixedScalableVFPair Result(ElementCount::getFixed(1),
4920                              ElementCount::getScalable(0));
4921   if (auto MaxVF =
4922           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
4923                                   MaxSafeFixedVF, FoldTailByMasking))
4924     Result.FixedVF = MaxVF;
4925 
4926   if (auto MaxVF =
4927           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
4928                                   MaxSafeScalableVF, FoldTailByMasking))
4929     if (MaxVF.isScalable()) {
4930       Result.ScalableVF = MaxVF;
4931       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4932                         << "\n");
4933     }
4934 
4935   return Result;
4936 }
4937 
4938 FixedScalableVFPair
4939 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4940   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since it's still likely to be
    // dynamically uniform if the target can skip.
4943     reportVectorizationFailure(
4944         "Not inserting runtime ptr check for divergent target",
4945         "runtime pointer checks needed. Not enabled for divergent target",
4946         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4947     return FixedScalableVFPair::getNone();
4948   }
4949 
4950   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4951   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4952   if (TC == 1) {
4953     reportVectorizationFailure("Single iteration (non) loop",
4954         "loop trip count is one, irrelevant for vectorization",
4955         "SingleIterationLoop", ORE, TheLoop);
4956     return FixedScalableVFPair::getNone();
4957   }
4958 
4959   switch (ScalarEpilogueStatus) {
4960   case CM_ScalarEpilogueAllowed:
4961     return computeFeasibleMaxVF(TC, UserVF, false);
4962   case CM_ScalarEpilogueNotAllowedUsePredicate:
4963     LLVM_FALLTHROUGH;
4964   case CM_ScalarEpilogueNotNeededUsePredicate:
4965     LLVM_DEBUG(
4966         dbgs() << "LV: vector predicate hint/switch found.\n"
4967                << "LV: Not allowing scalar epilogue, creating predicated "
4968                << "vector loop.\n");
4969     break;
4970   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4971     // fallthrough as a special case of OptForSize
4972   case CM_ScalarEpilogueNotAllowedOptSize:
4973     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4974       LLVM_DEBUG(
4975           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4976     else
4977       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4978                         << "count.\n");
4979 
4980     // Bail if runtime checks are required, which are not good when optimising
4981     // for size.
4982     if (runtimeChecksRequired())
4983       return FixedScalableVFPair::getNone();
4984 
4985     break;
4986   }
4987 
  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
4990   // that not every instruction executes on the last iteration.  This will
4991   // require a lane mask which varies through the vector loop body.  (TODO)
4992   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
4993     // If there was a tail-folding hint/switch, but we can't fold the tail by
4994     // masking, fallback to a vectorization with a scalar epilogue.
4995     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
4996       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
4997                            "scalar epilogue instead.\n");
4998       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
4999       return computeFeasibleMaxVF(TC, UserVF, false);
5000     }
5001     return FixedScalableVFPair::getNone();
5002   }
5003 
5004   // Now try the tail folding
5005 
5006   // Invalidate interleave groups that require an epilogue if we can't mask
5007   // the interleave-group.
5008   if (!useMaskedInterleavedAccesses(TTI)) {
5009     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5010            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5013     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5014   }
5015 
5016   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5017   // Avoid tail folding if the trip count is known to be a multiple of any VF
5018   // we chose.
5019   // FIXME: The condition below pessimises the case for fixed-width vectors,
5020   // when scalable VFs are also candidates for vectorization.
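  // Illustrative example (values are hypothetical): with a constant trip
  // count of 64, MaxFixedVF = 8 and a user interleave count of 2,
  // MaxVFtimesIC is 16 and 64 % 16 == 0, so no tail remains and tail folding
  // can be skipped.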
5021   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5022     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5023     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5024            "MaxFixedVF must be a power of 2");
5025     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5026                                    : MaxFixedVF.getFixedValue();
5027     ScalarEvolution *SE = PSE.getSE();
5028     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5029     const SCEV *ExitCount = SE->getAddExpr(
5030         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5031     const SCEV *Rem = SE->getURemExpr(
5032         SE->applyLoopGuards(ExitCount, TheLoop),
5033         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5034     if (Rem->isZero()) {
5035       // Accept MaxFixedVF if we do not have a tail.
5036       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5037       return MaxFactors;
5038     }
5039   }
5040 
5041   // If we don't know the precise trip count, or if the trip count that we
5042   // found modulo the vectorization factor is not zero, try to fold the tail
5043   // by masking.
5044   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5045   if (Legal->prepareToFoldTailByMasking()) {
5046     FoldTailByMasking = true;
5047     return MaxFactors;
5048   }
5049 
5050   // If there was a tail-folding hint/switch, but we can't fold the tail by
5051   // masking, fallback to a vectorization with a scalar epilogue.
5052   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5053     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5054                          "scalar epilogue instead.\n");
5055     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5056     return MaxFactors;
5057   }
5058 
5059   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5060     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5061     return FixedScalableVFPair::getNone();
5062   }
5063 
5064   if (TC == 0) {
5065     reportVectorizationFailure(
5066         "Unable to calculate the loop count due to complex control flow",
5067         "unable to calculate the loop count due to complex control flow",
5068         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5069     return FixedScalableVFPair::getNone();
5070   }
5071 
5072   reportVectorizationFailure(
5073       "Cannot optimize for size and vectorize at the same time.",
5074       "cannot optimize for size and vectorize at the same time. "
5075       "Enable vectorization of this loop with '#pragma clang loop "
5076       "vectorize(enable)' when compiling with -Os/-Oz",
5077       "NoTailLoopWithOptForSize", ORE, TheLoop);
5078   return FixedScalableVFPair::getNone();
5079 }
5080 
5081 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5082     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5083     ElementCount MaxSafeVF, bool FoldTailByMasking) {
5084   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5085   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5086       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5087                            : TargetTransformInfo::RGK_FixedWidthVector);
5088 
5089   // Convenience function to return the minimum of two ElementCounts.
5090   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5091     assert((LHS.isScalable() == RHS.isScalable()) &&
5092            "Scalable flags must match");
5093     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5094   };
5095 
5096   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
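  // Illustrative example (values are hypothetical): with a 128-bit (known
  // minimum) widest register and a 32-bit widest type, the element count
  // below is 4 for fixed-width vectors, or vscale x 4 for scalable ones.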
5098   auto MaxVectorElementCount = ElementCount::get(
5099       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5100       ComputeScalableMaxVF);
5101   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5102   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5103                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5104 
5105   if (!MaxVectorElementCount) {
5106     LLVM_DEBUG(dbgs() << "LV: The target has no "
5107                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5108                       << " vector registers.\n");
5109     return ElementCount::getFixed(1);
5110   }
5111 
5112   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5113   if (ConstTripCount &&
5114       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5115       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
    // If the loop trip count (TC) is known at compile time, there is no point
    // in choosing a VF greater than TC. Select the maximum power of two which
    // doesn't exceed TC.
5119     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5120     // when the TC is less than or equal to the known number of lanes.
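    // Illustrative example (values are hypothetical): with ConstTripCount = 7
    // and MaxVectorElementCount = 8, the chosen VF is clamped to
    // PowerOf2Floor(7) = 4.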
5121     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5122     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5123                          "exceeding the constant trip count: "
5124                       << ClampedConstTripCount << "\n");
5125     return ElementCount::getFixed(ClampedConstTripCount);
5126   }
5127 
5128   TargetTransformInfo::RegisterKind RegKind =
5129       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5130                            : TargetTransformInfo::RGK_FixedWidthVector;
5131   ElementCount MaxVF = MaxVectorElementCount;
5132   if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5133                             TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5134     auto MaxVectorElementCountMaxBW = ElementCount::get(
5135         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5136         ComputeScalableMaxVF);
5137     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5138 
5139     // Collect all viable vectorization factors larger than the default MaxVF
5140     // (i.e. MaxVectorElementCount).
5141     SmallVector<ElementCount, 8> VFs;
5142     for (ElementCount VS = MaxVectorElementCount * 2;
5143          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5144       VFs.push_back(VS);
5145 
5146     // For each VF calculate its register usage.
5147     auto RUs = calculateRegisterUsage(VFs);
5148 
5149     // Select the largest VF which doesn't require more registers than existing
5150     // ones.
5151     for (int i = RUs.size() - 1; i >= 0; --i) {
5152       bool Selected = true;
5153       for (auto &pair : RUs[i].MaxLocalUsers) {
5154         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5155         if (pair.second > TargetNumRegisters)
5156           Selected = false;
5157       }
5158       if (Selected) {
5159         MaxVF = VFs[i];
5160         break;
5161       }
5162     }
5163     if (ElementCount MinVF =
5164             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5165       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5166         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5167                           << ") with target's minimum: " << MinVF << '\n');
5168         MaxVF = MinVF;
5169       }
5170     }
5171 
5172     // Invalidate any widening decisions we might have made, in case the loop
5173     // requires prediction (decided later), but we have already made some
5174     // load/store widening decisions.
5175     invalidateCostModelingDecisions();
5176   }
5177   return MaxVF;
5178 }
5179 
5180 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5181   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5182     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5183     auto Min = Attr.getVScaleRangeMin();
5184     auto Max = Attr.getVScaleRangeMax();
5185     if (Max && Min == Max)
5186       return Max;
5187   }
5188 
5189   return TTI.getVScaleForTuning();
5190 }
5191 
5192 bool LoopVectorizationCostModel::isMoreProfitable(
5193     const VectorizationFactor &A, const VectorizationFactor &B) const {
5194   InstructionCost CostA = A.Cost;
5195   InstructionCost CostB = B.Cost;
5196 
5197   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5198 
5199   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5200       MaxTripCount) {
5201     // If we are folding the tail and the trip count is a known (possibly small)
5202     // constant, the trip count will be rounded up to an integer number of
5203     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5204     // which we compare directly. When not folding the tail, the total cost will
5205     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5206     // approximated with the per-lane cost below instead of using the tripcount
5207     // as here.
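    // Illustrative example (values are hypothetical): with MaxTripCount = 10,
    // a VF=4 factor of cost 8 gives 8 * ceil(10/4) = 24, while a VF=8 factor
    // of cost 14 gives 14 * ceil(10/8) = 28, so the VF=4 factor is preferred.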
5208     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5209     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5210     return RTCostA < RTCostB;
5211   }
5212 
5213   // Improve estimate for the vector width if it is scalable.
5214   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5215   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5216   if (Optional<unsigned> VScale = getVScaleForTuning()) {
5217     if (A.Width.isScalable())
5218       EstimatedWidthA *= VScale.getValue();
5219     if (B.Width.isScalable())
5220       EstimatedWidthB *= VScale.getValue();
5221   }
5222 
5223   // Assume vscale may be larger than 1 (or the value being tuned for),
5224   // so that scalable vectorization is slightly favorable over fixed-width
5225   // vectorization.
5226   if (A.Width.isScalable() && !B.Width.isScalable())
5227     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5228 
5229   // To avoid the need for FP division:
5230   //      (CostA / A.Width) < (CostB / B.Width)
5231   // <=>  (CostA * B.Width) < (CostB * A.Width)
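  // Illustrative example (values are hypothetical): with CostA = 20 at an
  // estimated width of 4 and CostB = 36 at an estimated width of 8, we
  // compare 20 * 8 = 160 against 36 * 4 = 144, so B has the lower per-lane
  // cost and A is not considered more profitable.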
5232   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5233 }
5234 
5235 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5236     const ElementCountSet &VFCandidates) {
5237   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5238   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5239   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5240   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5241          "Expected Scalar VF to be a candidate");
5242 
5243   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5244                                        ExpectedCost);
5245   VectorizationFactor ChosenFactor = ScalarCost;
5246 
5247   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5248   if (ForceVectorization && VFCandidates.size() > 1) {
5249     // Ignore scalar width, because the user explicitly wants vectorization.
5250     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5251     // evaluation.
5252     ChosenFactor.Cost = InstructionCost::getMax();
5253   }
5254 
5255   SmallVector<InstructionVFPair> InvalidCosts;
5256   for (const auto &i : VFCandidates) {
5257     // The cost for scalar VF=1 is already calculated, so ignore it.
5258     if (i.isScalar())
5259       continue;
5260 
5261     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5262     VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5263 
5264 #ifndef NDEBUG
5265     unsigned AssumedMinimumVscale = 1;
5266     if (Optional<unsigned> VScale = getVScaleForTuning())
5267       AssumedMinimumVscale = *VScale;
5268     unsigned Width =
5269         Candidate.Width.isScalable()
5270             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5271             : Candidate.Width.getFixedValue();
5272     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5273                       << " costs: " << (Candidate.Cost / Width));
5274     if (i.isScalable())
5275       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5276                         << AssumedMinimumVscale << ")");
5277     LLVM_DEBUG(dbgs() << ".\n");
5278 #endif
5279 
5280     if (!C.second && !ForceVectorization) {
5281       LLVM_DEBUG(
5282           dbgs() << "LV: Not considering vector loop of width " << i
5283                  << " because it will not generate any vector instructions.\n");
5284       continue;
5285     }
5286 
    // If profitable, add it to the ProfitableVFs list.
5288     if (isMoreProfitable(Candidate, ScalarCost))
5289       ProfitableVFs.push_back(Candidate);
5290 
5291     if (isMoreProfitable(Candidate, ChosenFactor))
5292       ChosenFactor = Candidate;
5293   }
5294 
5295   // Emit a report of VFs with invalid costs in the loop.
5296   if (!InvalidCosts.empty()) {
5297     // Group the remarks per instruction, keeping the instruction order from
5298     // InvalidCosts.
5299     std::map<Instruction *, unsigned> Numbering;
5300     unsigned I = 0;
5301     for (auto &Pair : InvalidCosts)
5302       if (!Numbering.count(Pair.first))
5303         Numbering[Pair.first] = I++;
5304 
5305     // Sort the list, first on instruction(number) then on VF.
5306     llvm::sort(InvalidCosts,
5307                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5308                  if (Numbering[A.first] != Numbering[B.first])
5309                    return Numbering[A.first] < Numbering[B.first];
5310                  ElementCountComparator ECC;
5311                  return ECC(A.second, B.second);
5312                });
5313 
5314     // For a list of ordered instruction-vf pairs:
5315     //   [(load, vf1), (load, vf2), (store, vf1)]
5316     // Group the instructions together to emit separate remarks for:
5317     //   load  (vf1, vf2)
5318     //   store (vf1)
5319     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5320     auto Subset = ArrayRef<InstructionVFPair>();
5321     do {
5322       if (Subset.empty())
5323         Subset = Tail.take_front(1);
5324 
5325       Instruction *I = Subset.front().first;
5326 
5327       // If the next instruction is different, or if there are no other pairs,
5328       // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2)]
      // to emit:
      //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5332       if (Subset == Tail || Tail[Subset.size()].first != I) {
5333         std::string OutString;
5334         raw_string_ostream OS(OutString);
5335         assert(!Subset.empty() && "Unexpected empty range");
5336         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5337         for (auto &Pair : Subset)
5338           OS << (Pair.second == Subset.front().second ? "" : ", ")
5339              << Pair.second;
5340         OS << "):";
5341         if (auto *CI = dyn_cast<CallInst>(I))
5342           OS << " call to " << CI->getCalledFunction()->getName();
5343         else
5344           OS << " " << I->getOpcodeName();
5345         OS.flush();
5346         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5347         Tail = Tail.drop_front(Subset.size());
5348         Subset = {};
5349       } else
5350         // Grow the subset by one element
5351         Subset = Tail.take_front(Subset.size() + 1);
5352     } while (!Tail.empty());
5353   }
5354 
5355   if (!EnableCondStoresVectorization && NumPredStores) {
5356     reportVectorizationFailure("There are conditional stores.",
5357         "store that is conditionally executed prevents vectorization",
5358         "ConditionalStore", ORE, TheLoop);
5359     ChosenFactor = ScalarCost;
5360   }
5361 
5362   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5363                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5364              << "LV: Vectorization seems to be not beneficial, "
5365              << "but was forced by a user.\n");
5366   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5367   return ChosenFactor;
5368 }
5369 
5370 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5371     const Loop &L, ElementCount VF) const {
5372   // Cross iteration phis such as reductions need special handling and are
5373   // currently unsupported.
5374   if (any_of(L.getHeader()->phis(),
5375              [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
5376     return false;
5377 
5378   // Phis with uses outside of the loop require special handling and are
5379   // currently unsupported.
5380   for (auto &Entry : Legal->getInductionVars()) {
5381     // Look for uses of the value of the induction at the last iteration.
5382     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5383     for (User *U : PostInc->users())
5384       if (!L.contains(cast<Instruction>(U)))
5385         return false;
5386     // Look for uses of penultimate value of the induction.
5387     for (User *U : Entry.first->users())
5388       if (!L.contains(cast<Instruction>(U)))
5389         return false;
5390   }
5391 
5392   // Induction variables that are widened require special handling that is
5393   // currently not supported.
5394   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5395         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5396                  this->isProfitableToScalarize(Entry.first, VF));
5397       }))
5398     return false;
5399 
  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
5403   if (L.getExitingBlock() != L.getLoopLatch())
5404     return false;
5405 
5406   return true;
5407 }
5408 
5409 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5410     const ElementCount VF) const {
5411   // FIXME: We need a much better cost-model to take different parameters such
5412   // as register pressure, code size increase and cost of extra branches into
5413   // account. For now we apply a very crude heuristic and only consider loops
5414   // with vectorization factors larger than a certain value.
5415   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
5417   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5418     return false;
5419   // FIXME: We should consider changing the threshold for scalable
5420   // vectors to take VScaleForTuning into account.
5421   if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5422     return true;
5423   return false;
5424 }
5425 
5426 VectorizationFactor
5427 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5428     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5429   VectorizationFactor Result = VectorizationFactor::Disabled();
5430   if (!EnableEpilogueVectorization) {
5431     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5432     return Result;
5433   }
5434 
5435   if (!isScalarEpilogueAllowed()) {
5436     LLVM_DEBUG(
5437         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5438                   "allowed.\n";);
5439     return Result;
5440   }
5441 
5442   // Not really a cost consideration, but check for unsupported cases here to
5443   // simplify the logic.
5444   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5445     LLVM_DEBUG(
5446         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5447                   "not a supported candidate.\n";);
5448     return Result;
5449   }
5450 
5451   if (EpilogueVectorizationForceVF > 1) {
5452     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5453     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5454     if (LVP.hasPlanWithVF(ForcedEC))
5455       return {ForcedEC, 0, 0};
5456     else {
5457       LLVM_DEBUG(
5458           dbgs()
5459               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5460       return Result;
5461     }
5462   }
5463 
5464   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5465       TheLoop->getHeader()->getParent()->hasMinSize()) {
5466     LLVM_DEBUG(
5467         dbgs()
5468             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5469     return Result;
5470   }
5471 
5472   if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5473     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5474                          "this loop\n");
5475     return Result;
5476   }
5477 
5478   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5479   // the main loop handles 8 lanes per iteration. We could still benefit from
5480   // vectorizing the epilogue loop with VF=4.
5481   ElementCount EstimatedRuntimeVF = MainLoopVF;
5482   if (MainLoopVF.isScalable()) {
5483     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5484     if (Optional<unsigned> VScale = getVScaleForTuning())
5485       EstimatedRuntimeVF *= *VScale;
5486   }
5487 
5488   for (auto &NextVF : ProfitableVFs)
5489     if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5490           ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5491          ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5492         (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5493         LVP.hasPlanWithVF(NextVF.Width))
5494       Result = NextVF;
5495 
5496   if (Result != VectorizationFactor::Disabled())
5497     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5498                       << Result.Width << "\n";);
5499   return Result;
5500 }
5501 
5502 std::pair<unsigned, unsigned>
5503 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5504   unsigned MinWidth = -1U;
5505   unsigned MaxWidth = 8;
5506   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5507   // For in-loop reductions, no element types are added to ElementTypesInLoop
5508   // if there are no loads/stores in the loop. In this case, check through the
5509   // reduction variables to determine the maximum width.
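  // Illustrative example (values are hypothetical): a reduction over an i32
  // recurrence whose inputs are extended from i8 contributes
  // min(8, 32) = 8 bits to the width computed below.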
5510   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5511     // Reset MaxWidth so that we can find the smallest type used by recurrences
5512     // in the loop.
5513     MaxWidth = -1U;
5514     for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
5515       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5516       // When finding the min width used by the recurrence we need to account
5517       // for casts on the input operands of the recurrence.
5518       MaxWidth = std::min<unsigned>(
5519           MaxWidth, std::min<unsigned>(
5520                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5521                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5522     }
5523   } else {
5524     for (Type *T : ElementTypesInLoop) {
5525       MinWidth = std::min<unsigned>(
5526           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5527       MaxWidth = std::max<unsigned>(
5528           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5529     }
5530   }
5531   return {MinWidth, MaxWidth};
5532 }
5533 
5534 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5535   ElementTypesInLoop.clear();
5536   // For each block.
5537   for (BasicBlock *BB : TheLoop->blocks()) {
5538     // For each instruction in the loop.
5539     for (Instruction &I : BB->instructionsWithoutDebug()) {
5540       Type *T = I.getType();
5541 
5542       // Skip ignored values.
5543       if (ValuesToIgnore.count(&I))
5544         continue;
5545 
5546       // Only examine Loads, Stores and PHINodes.
5547       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5548         continue;
5549 
5550       // Examine PHI nodes that are reduction variables. Update the type to
5551       // account for the recurrence type.
5552       if (auto *PN = dyn_cast<PHINode>(&I)) {
5553         if (!Legal->isReductionVariable(PN))
5554           continue;
5555         const RecurrenceDescriptor &RdxDesc =
5556             Legal->getReductionVars().find(PN)->second;
5557         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5558             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5559                                       RdxDesc.getRecurrenceType(),
5560                                       TargetTransformInfo::ReductionFlags()))
5561           continue;
5562         T = RdxDesc.getRecurrenceType();
5563       }
5564 
5565       // Examine the stored values.
5566       if (auto *ST = dyn_cast<StoreInst>(&I))
5567         T = ST->getValueOperand()->getType();
5568 
5569       assert(T->isSized() &&
5570              "Expected the load/store/recurrence type to be sized");
5571 
5572       ElementTypesInLoop.insert(T);
5573     }
5574   }
5575 }
5576 
5577 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5578                                                            unsigned LoopCost) {
5579   // -- The interleave heuristics --
5580   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5581   // There are many micro-architectural considerations that we can't predict
5582   // at this level. For example, frontend pressure (on decode or fetch) due to
5583   // code size, or the number and capabilities of the execution ports.
5584   //
5585   // We use the following heuristics to select the interleave count:
5586   // 1. If the code has reductions, then we interleave to break the cross
5587   // iteration dependency.
5588   // 2. If the loop is really small, then we interleave to reduce the loop
5589   // overhead.
5590   // 3. We don't interleave if we think that we will spill registers to memory
5591   // due to the increased register pressure.
5592 
5593   if (!isScalarEpilogueAllowed())
5594     return 1;
5595 
  // The maximum safe dependence distance was already used to limit the VF,
  // so do not interleave further.
5597   if (Legal->getMaxSafeDepDistBytes() != -1U)
5598     return 1;
5599 
5600   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5601   const bool HasReductions = !Legal->getReductionVars().empty();
5602   // Do not interleave loops with a relatively small known or estimated trip
5603   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5605   // because with the above conditions interleaving can expose ILP and break
5606   // cross iteration dependences for reductions.
5607   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5608       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5609     return 1;
5610 
5611   // If we did not calculate the cost for VF (because the user selected the VF)
5612   // then we calculate the cost of VF here.
5613   if (LoopCost == 0) {
5614     InstructionCost C = expectedCost(VF).first;
5615     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
5616     LoopCost = *C.getValue();
5617 
5618     // Loop body is free and there is no need for interleaving.
5619     if (LoopCost == 0)
5620       return 1;
5621   }
5622 
5623   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these counts, so assume that we have at least one
  // instruction that uses at least one register.
5626   for (auto& pair : R.MaxLocalUsers) {
5627     pair.second = std::max(pair.second, 1U);
5628   }
5629 
5630   // We calculate the interleave count using the following formula.
5631   // Subtract the number of loop invariants from the number of available
5632   // registers. These registers are used by all of the interleaved instances.
5633   // Next, divide the remaining registers by the number of registers that is
5634   // required by the loop, in order to estimate how many parallel instances
5635   // fit without causing spills. All of this is rounded down if necessary to be
5636   // a power of two. We want power of two interleave count to simplify any
5637   // addressing operations or alignment considerations.
5638   // We also want power of two interleave counts to ensure that the induction
5639   // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when optimizing for size, in which case IC is set
  // to 1 above.
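  // Illustrative example (values are hypothetical): with 32 registers in a
  // class, 2 used by loop-invariant values and a maximum local usage of 5,
  // the candidate count is PowerOf2Floor((32 - 2) / 5) = 4, or
  // PowerOf2Floor((32 - 2 - 1) / 4) = 4 when the induction variable is
  // excluded via EnableIndVarRegisterHeur.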
5641   unsigned IC = UINT_MAX;
5642 
5643   for (auto& pair : R.MaxLocalUsers) {
5644     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5645     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5646                       << " registers of "
5647                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5648     if (VF.isScalar()) {
5649       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5650         TargetNumRegisters = ForceTargetNumScalarRegs;
5651     } else {
5652       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5653         TargetNumRegisters = ForceTargetNumVectorRegs;
5654     }
5655     unsigned MaxLocalUsers = pair.second;
5656     unsigned LoopInvariantRegs = 0;
5657     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5658       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5659 
5660     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5661     // Don't count the induction variable as interleaved.
5662     if (EnableIndVarRegisterHeur) {
5663       TmpIC =
5664           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5665                         std::max(1U, (MaxLocalUsers - 1)));
5666     }
5667 
5668     IC = std::min(IC, TmpIC);
5669   }
5670 
5671   // Clamp the interleave ranges to reasonable counts.
5672   unsigned MaxInterleaveCount =
5673       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5674 
5675   // Check if the user has overridden the max.
5676   if (VF.isScalar()) {
5677     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5678       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5679   } else {
5680     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5681       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5682   }
5683 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF, provided it is
  // at least 1.
5687   //
5688   // For scalable vectors we can't know if interleaving is beneficial. It may
5689   // not be beneficial for small loops if none of the lanes in the second vector
  // iteration is enabled. However, for larger loops, there is likely to be a
5691   // similar benefit as for fixed-width vectors. For now, we choose to leave
5692   // the InterleaveCount as if vscale is '1', although if some information about
5693   // the vector is known (e.g. min vector size), we can make a better decision.
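  // Illustrative example (values are hypothetical): with an estimated trip
  // count of 24 and VF = 8, the interleave count is capped at 24 / 8 = 3
  // (and never allowed to drop below 1).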
5694   if (BestKnownTC) {
5695     MaxInterleaveCount =
5696         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5697     // Make sure MaxInterleaveCount is greater than 0.
5698     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5699   }
5700 
5701   assert(MaxInterleaveCount > 0 &&
5702          "Maximum interleave count must be greater than 0");
5703 
  // Clamp the calculated IC to be between 1 and the max interleave count
5705   // that the target and trip count allows.
5706   if (IC > MaxInterleaveCount)
5707     IC = MaxInterleaveCount;
5708   else
5709     // Make sure IC is greater than 0.
5710     IC = std::max(1u, IC);
5711 
5712   assert(IC > 0 && "Interleave count must be greater than 0.");
5713 
5714   // Interleave if we vectorized this loop and there is a reduction that could
5715   // benefit from interleaving.
5716   if (VF.isVector() && HasReductions) {
5717     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5718     return IC;
5719   }
5720 
5721   // For any scalar loop that either requires runtime checks or predication we
5722   // are better off leaving this to the unroller. Note that if we've already
5723   // vectorized the loop we will have done the runtime check and so interleaving
5724   // won't require further checks.
5725   bool ScalarInterleavingRequiresPredication =
5726       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5727          return Legal->blockNeedsPredication(BB);
5728        }));
5729   bool ScalarInterleavingRequiresRuntimePointerCheck =
5730       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5731 
5732   // We want to interleave small loops in order to reduce the loop overhead and
5733   // potentially expose ILP opportunities.
5734   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5735                     << "LV: IC is " << IC << '\n'
5736                     << "LV: VF is " << VF << '\n');
5737   const bool AggressivelyInterleaveReductions =
5738       TTI.enableAggressiveInterleaving(HasReductions);
5739   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5740       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5741     // We assume that the cost overhead is 1 and we use the cost model
5742     // to estimate the cost of the loop and interleave until the cost of the
5743     // loop overhead is about 5% of the cost of the loop.
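    // Illustrative example (values are hypothetical, including the small-loop
    // threshold): with a threshold of 20 and a loop cost of 3, SmallIC is
    // min(IC, PowerOf2Floor(20 / 3)) = min(IC, 4).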
5744     unsigned SmallIC =
5745         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5746 
5747     // Interleave until store/load ports (estimated by max interleave count) are
5748     // saturated.
5749     unsigned NumStores = Legal->getNumStores();
5750     unsigned NumLoads = Legal->getNumLoads();
5751     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5752     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5753 
5754     // There is little point in interleaving for reductions containing selects
5755     // and compares when VF=1 since it may just create more overhead than it's
5756     // worth for loops with small trip counts. This is because we still have to
5757     // do the final reduction after the loop.
5758     bool HasSelectCmpReductions =
5759         HasReductions &&
5760         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5761           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5762           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
5763               RdxDesc.getRecurrenceKind());
5764         });
5765     if (HasSelectCmpReductions) {
5766       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5767       return 1;
5768     }
5769 
5770     // If we have a scalar reduction (vector reductions are already dealt with
5771     // by this point), we can increase the critical path length if the loop
5772     // we're interleaving is inside another loop. For tree-wise reductions
5773     // set the limit to 2, and for ordered reductions it's best to disable
5774     // interleaving entirely.
5775     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5776       bool HasOrderedReductions =
5777           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5778             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5779             return RdxDesc.isOrdered();
5780           });
5781       if (HasOrderedReductions) {
5782         LLVM_DEBUG(
5783             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5784         return 1;
5785       }
5786 
5787       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5788       SmallIC = std::min(SmallIC, F);
5789       StoresIC = std::min(StoresIC, F);
5790       LoadsIC = std::min(LoadsIC, F);
5791     }
5792 
5793     if (EnableLoadStoreRuntimeInterleave &&
5794         std::max(StoresIC, LoadsIC) > SmallIC) {
5795       LLVM_DEBUG(
5796           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5797       return std::max(StoresIC, LoadsIC);
5798     }
5799 
5800     // If there are scalar reductions and TTI has enabled aggressive
5801     // interleaving for reductions, we will interleave to expose ILP.
5802     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5803         AggressivelyInterleaveReductions) {
5804       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5805       // Interleave no less than SmallIC but not as aggressive as the normal IC
5806       // to satisfy the rare situation when resources are too limited.
5807       return std::max(IC / 2, SmallIC);
5808     } else {
5809       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5810       return SmallIC;
5811     }
5812   }
5813 
5814   // Interleave if this is a large loop (small loops are already dealt with by
5815   // this point) that could benefit from interleaving.
5816   if (AggressivelyInterleaveReductions) {
5817     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5818     return IC;
5819   }
5820 
5821   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5822   return 1;
5823 }
5824 
5825 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5826 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5827   // This function calculates the register usage by measuring the highest number
5828   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are
5831   // met before their users. We assume that each instruction that has in-loop
5832   // users starts an interval. We record every time that an in-loop value is
5833   // used, so we have a list of the first and last occurrences of each
5834   // instruction. Next, we transpose this data structure into a multi map that
5835   // holds the list of intervals that *end* at a specific location. This multi
5836   // map allows us to perform a linear search. We scan the instructions linearly
5837   // and record each time that a new interval starts, by placing it in a set.
5838   // If we find this value in the multi-map then we remove it from the set.
5839   // The max register usage is the maximum size of the set.
5840   // We also search for instructions that are defined outside the loop, but are
5841   // used inside the loop. We need this number separately from the max-interval
5842   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
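  // Illustrative example (values are hypothetical): for the in-loop sequence
  //   %a = ...; %b = ...; %c = add %a, %b; %d = mul %c, %c
  // the intervals for %a and %b both end at their last use in %c, so this
  // estimate sees at most two values open at once for the sequence.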
5844   LoopBlocksDFS DFS(TheLoop);
5845   DFS.perform(LI);
5846 
5847   RegisterUsage RU;
5848 
5849   // Each 'key' in the map opens a new interval. The values
5850   // of the map are the index of the 'last seen' usage of the
5851   // instruction that is the key.
5852   using IntervalMap = DenseMap<Instruction *, unsigned>;
5853 
5854   // Maps instruction to its index.
5855   SmallVector<Instruction *, 64> IdxToInstr;
5856   // Marks the end of each interval.
5857   IntervalMap EndPoint;
5858   // Saves the list of instruction indices that are used in the loop.
5859   SmallPtrSet<Instruction *, 8> Ends;
5860   // Saves the list of values that are used in the loop but are
5861   // defined outside the loop, such as arguments and constants.
5862   SmallPtrSet<Value *, 8> LoopInvariants;
5863 
5864   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5865     for (Instruction &I : BB->instructionsWithoutDebug()) {
5866       IdxToInstr.push_back(&I);
5867 
5868       // Save the end location of each USE.
5869       for (Value *U : I.operands()) {
5870         auto *Instr = dyn_cast<Instruction>(U);
5871 
5872         // Ignore non-instruction values such as arguments, constants, etc.
5873         if (!Instr)
5874           continue;
5875 
5876         // If this instruction is outside the loop then record it and continue.
5877         if (!TheLoop->contains(Instr)) {
5878           LoopInvariants.insert(Instr);
5879           continue;
5880         }
5881 
5882         // Overwrite previous end points.
5883         EndPoint[Instr] = IdxToInstr.size();
5884         Ends.insert(Instr);
5885       }
5886     }
5887   }
5888 
5889   // Saves the list of intervals that end with the index in 'key'.
5890   using InstrList = SmallVector<Instruction *, 2>;
5891   DenseMap<unsigned, InstrList> TransposeEnds;
5892 
5893   // Transpose the EndPoints to a list of values that end at each index.
5894   for (auto &Interval : EndPoint)
5895     TransposeEnds[Interval.second].push_back(Interval.first);
5896 
5897   SmallPtrSet<Instruction *, 8> OpenIntervals;
5898   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5899   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5900 
5901   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5902 
5903   auto GetRegUsage = [&TTI = TTI](Type *Ty, ElementCount VF) -> unsigned {
5904     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5905       return 0;
5906     return TTI.getRegUsageForType(VectorType::get(Ty, VF));
5907   };
5908 
5909   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5910     Instruction *I = IdxToInstr[i];
5911 
5912     // Remove all of the instructions that end at this location.
5913     InstrList &List = TransposeEnds[i];
5914     for (Instruction *ToRemove : List)
5915       OpenIntervals.erase(ToRemove);
5916 
5917     // Ignore instructions that are never used within the loop.
5918     if (!Ends.count(I))
5919       continue;
5920 
5921     // Skip ignored values.
5922     if (ValuesToIgnore.count(I))
5923       continue;
5924 
5925     // For each VF find the maximum usage of registers.
5926     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5927       // Count the number of live intervals.
5928       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5929 
5930       if (VFs[j].isScalar()) {
5931         for (auto Inst : OpenIntervals) {
5932           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5933           if (RegUsage.find(ClassID) == RegUsage.end())
5934             RegUsage[ClassID] = 1;
5935           else
5936             RegUsage[ClassID] += 1;
5937         }
5938       } else {
5939         collectUniformsAndScalars(VFs[j]);
5940         for (auto Inst : OpenIntervals) {
5941           // Skip ignored values for VF > 1.
5942           if (VecValuesToIgnore.count(Inst))
5943             continue;
5944           if (isScalarAfterVectorization(Inst, VFs[j])) {
5945             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5946             if (RegUsage.find(ClassID) == RegUsage.end())
5947               RegUsage[ClassID] = 1;
5948             else
5949               RegUsage[ClassID] += 1;
5950           } else {
5951             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5952             if (RegUsage.find(ClassID) == RegUsage.end())
5953               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5954             else
5955               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5956           }
5957         }
5958       }
5959 
5960       for (auto& pair : RegUsage) {
5961         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5962           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5963         else
5964           MaxUsages[j][pair.first] = pair.second;
5965       }
5966     }
5967 
5968     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5969                       << OpenIntervals.size() << '\n');
5970 
5971     // Add the current instruction to the list of open intervals.
5972     OpenIntervals.insert(I);
5973   }
5974 
5975   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5976     SmallMapVector<unsigned, unsigned, 4> Invariant;
5977 
5978     for (auto Inst : LoopInvariants) {
5979       unsigned Usage =
5980           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5981       unsigned ClassID =
5982           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      Invariant[ClassID] += Usage;
5987     }
5988 
5989     LLVM_DEBUG({
5990       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5991       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5992              << " item\n";
5993       for (const auto &pair : MaxUsages[i]) {
5994         dbgs() << "LV(REG): RegisterClass: "
5995                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5996                << " registers\n";
5997       }
5998       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5999              << " item\n";
6000       for (const auto &pair : Invariant) {
6001         dbgs() << "LV(REG): RegisterClass: "
6002                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6003                << " registers\n";
6004       }
6005     });
6006 
6007     RU.LoopInvariantRegs = Invariant;
6008     RU.MaxLocalUsers = MaxUsages[i];
6009     RUs[i] = RU;
6010   }
6011 
6012   return RUs;
6013 }
6014 
6015 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6016                                                            ElementCount VF) {
6017   // TODO: Cost model for emulated masked load/store is completely
6018   // broken. This hack guides the cost model to use an artificially
6019   // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Only a limited number of Masked Store/Scatter emulations was allowed.
6025   assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
6026   return isa<LoadInst>(I) ||
6027          (isa<StoreInst>(I) &&
6028           NumPredStores > NumberOfStoresToPredicate);
6029 }
6030 
6031 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6032   // If we aren't vectorizing the loop, or if we've already collected the
6033   // instructions to scalarize, there's nothing to do. Collection may already
6034   // have occurred if we have a user-selected VF and are now computing the
6035   // expected cost for interleaving.
6036   if (VF.isScalar() || VF.isZero() ||
6037       InstsToScalarize.find(VF) != InstsToScalarize.end())
6038     return;
6039 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6041   // not profitable to scalarize any instructions, the presence of VF in the
6042   // map will indicate that we've analyzed it already.
6043   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6044 
6045   // Find all the instructions that are scalar with predication in the loop and
6046   // determine if it would be better to not if-convert the blocks they are in.
6047   // If so, we also record the instructions to scalarize.
6048   for (BasicBlock *BB : TheLoop->blocks()) {
6049     if (!blockNeedsPredicationForAnyReason(BB))
6050       continue;
6051     for (Instruction &I : *BB)
6052       if (isScalarWithPredication(&I, VF)) {
6053         ScalarCostsTy ScalarCosts;
6054         // Do not apply discount if scalable, because that would lead to
6055         // invalid scalarization costs.
6056         // Do not apply discount logic if hacked cost is needed
6057         // for emulated masked memrefs.
6058         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6059             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6060           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6061         // Remember that BB will remain after vectorization.
6062         PredicatedBBsAfterVectorization.insert(BB);
6063       }
6064   }
6065 }
6066 
6067 int LoopVectorizationCostModel::computePredInstDiscount(
6068     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6069   assert(!isUniformAfterVectorization(PredInst, VF) &&
6070          "Instruction marked uniform-after-vectorization will be predicated");
6071 
6072   // Initialize the discount to zero, meaning that the scalar version and the
6073   // vector version cost the same.
6074   InstructionCost Discount = 0;
6075 
6076   // Holds instructions to analyze. The instructions we visit are mapped in
6077   // ScalarCosts. Those instructions are the ones that would be scalarized if
6078   // we find that the scalar version costs less.
6079   SmallVector<Instruction *, 8> Worklist;
6080 
6081   // Returns true if the given instruction can be scalarized.
6082   auto canBeScalarized = [&](Instruction *I) -> bool {
6083     // We only attempt to scalarize instructions forming a single-use chain
6084     // from the original predicated block that would otherwise be vectorized.
6085     // Although not strictly necessary, we give up on instructions we know will
6086     // already be scalar to avoid traversing chains that are unlikely to be
6087     // beneficial.
6088     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6089         isScalarAfterVectorization(I, VF))
6090       return false;
6091 
6092     // If the instruction is scalar with predication, it will be analyzed
6093     // separately. We ignore it within the context of PredInst.
6094     if (isScalarWithPredication(I, VF))
6095       return false;
6096 
6097     // If any of the instruction's operands are uniform after vectorization,
6098     // the instruction cannot be scalarized. This prevents, for example, a
6099     // masked load from being scalarized.
6100     //
6101     // We assume we will only emit a value for lane zero of an instruction
6102     // marked uniform after vectorization, rather than VF identical values.
6103     // Thus, if we scalarize an instruction that uses a uniform, we would
6104     // create uses of values corresponding to the lanes we aren't emitting code
6105     // for. This behavior can be changed by allowing getScalarValue to clone
6106     // the lane zero values for uniforms rather than asserting.
6107     for (Use &U : I->operands())
6108       if (auto *J = dyn_cast<Instruction>(U.get()))
6109         if (isUniformAfterVectorization(J, VF))
6110           return false;
6111 
6112     // Otherwise, we can scalarize the instruction.
6113     return true;
6114   };
6115 
6116   // Compute the expected cost discount from scalarizing the entire expression
6117   // feeding the predicated instruction. We currently only consider expressions
6118   // that are single-use instruction chains.
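  // Worked example with illustrative numbers (not from any particular target):
  // at VF = 4, if the predicated instruction's vector cost (which already
  // includes its scalarization overhead) is 12, its per-lane scalar cost is 2,
  // and getReciprocalPredBlockProb() is 2, then ignoring insert/extract
  // overhead the scalar estimate is 4 * 2 = 8, scaled to 8 / 2 = 4, and the
  // discount grows by 12 - 4 = 8, so scalarizing the chain looks profitable.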
6119   Worklist.push_back(PredInst);
6120   while (!Worklist.empty()) {
6121     Instruction *I = Worklist.pop_back_val();
6122 
6123     // If we've already analyzed the instruction, there's nothing to do.
6124     if (ScalarCosts.find(I) != ScalarCosts.end())
6125       continue;
6126 
6127     // Compute the cost of the vector instruction. Note that this cost already
6128     // includes the scalarization overhead of the predicated instruction.
6129     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6130 
6131     // Compute the cost of the scalarized instruction. This cost is the cost of
6132     // the instruction as if it wasn't if-converted and instead remained in the
6133     // predicated block. We will scale this cost by block probability after
6134     // computing the scalarization overhead.
6135     InstructionCost ScalarCost =
6136         VF.getFixedValue() *
6137         getInstructionCost(I, ElementCount::getFixed(1)).first;
6138 
6139     // Compute the scalarization overhead of needed insertelement instructions
6140     // and phi nodes.
6141     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6142       ScalarCost += TTI.getScalarizationOverhead(
6143           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6144           APInt::getAllOnes(VF.getFixedValue()), true, false);
6145       ScalarCost +=
6146           VF.getFixedValue() *
6147           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6148     }
6149 
6150     // Compute the scalarization overhead of needed extractelement
6151     // instructions. For each of the instruction's operands, if the operand can
6152     // be scalarized, add it to the worklist; otherwise, account for the
6153     // overhead.
6154     for (Use &U : I->operands())
6155       if (auto *J = dyn_cast<Instruction>(U.get())) {
6156         assert(VectorType::isValidElementType(J->getType()) &&
6157                "Instruction has non-scalar type");
6158         if (canBeScalarized(J))
6159           Worklist.push_back(J);
6160         else if (needsExtract(J, VF)) {
6161           ScalarCost += TTI.getScalarizationOverhead(
6162               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6163               APInt::getAllOnes(VF.getFixedValue()), false, true);
6164         }
6165       }
6166 
6167     // Scale the total scalar cost by block probability.
6168     ScalarCost /= getReciprocalPredBlockProb();
6169 
6170     // Compute the discount. A non-negative discount means the vector version
6171     // of the instruction costs more, and scalarizing would be beneficial.
6172     Discount += VectorCost - ScalarCost;
6173     ScalarCosts[I] = ScalarCost;
6174   }
6175 
6176   return *Discount.getValue();
6177 }
6178 
6179 LoopVectorizationCostModel::VectorizationCostTy
6180 LoopVectorizationCostModel::expectedCost(
6181     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6182   VectorizationCostTy Cost;
6183 
6184   // For each block.
6185   for (BasicBlock *BB : TheLoop->blocks()) {
6186     VectorizationCostTy BlockCost;
6187 
6188     // For each instruction in the old loop.
6189     for (Instruction &I : BB->instructionsWithoutDebug()) {
6190       // Skip ignored values.
6191       if (ValuesToIgnore.count(&I) ||
6192           (VF.isVector() && VecValuesToIgnore.count(&I)))
6193         continue;
6194 
6195       VectorizationCostTy C = getInstructionCost(&I, VF);
6196 
6197       // Check if we should override the cost.
6198       if (C.first.isValid() &&
6199           ForceTargetInstructionCost.getNumOccurrences() > 0)
6200         C.first = InstructionCost(ForceTargetInstructionCost);
6201 
6202       // Keep a list of instructions with invalid costs.
6203       if (Invalid && !C.first.isValid())
6204         Invalid->emplace_back(&I, VF);
6205 
6206       BlockCost.first += C.first;
6207       BlockCost.second |= C.second;
6208       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6209                         << " for VF " << VF << " For instruction: " << I
6210                         << '\n');
6211     }
6212 
6213     // If we are vectorizing a predicated block, it will have been
6214     // if-converted. This means that the block's instructions (aside from
6215     // stores and instructions that may divide by zero) will now be
6216     // unconditionally executed. For the scalar case, we may not always execute
6217     // the predicated block, if it is an if-else block. Thus, scale the block's
6218     // cost by the probability of executing it. blockNeedsPredication from
6219     // Legal is used so as to not include all blocks in tail folded loops.
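    // For example (illustrative): with getReciprocalPredBlockProb() returning
    // 2 (i.e. assuming a predicated block executes about half the time), a
    // predicated block whose instructions sum to a scalar cost of 10 is
    // accounted as 5.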
6220     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6221       BlockCost.first /= getReciprocalPredBlockProb();
6222 
6223     Cost.first += BlockCost.first;
6224     Cost.second |= BlockCost.second;
6225   }
6226 
6227   return Cost;
6228 }
6229 
6230 /// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except for the induction variable dependence.
6232 ///
6233 /// This SCEV can be sent to the Target in order to estimate the address
6234 /// calculation cost.
6235 static const SCEV *getAddressAccessSCEV(
6236               Value *Ptr,
6237               LoopVectorizationLegality *Legal,
6238               PredicatedScalarEvolution &PSE,
6239               const Loop *TheLoop) {
6240 
6241   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6242   if (!Gep)
6243     return nullptr;
6244 
6245   // We are looking for a gep with all loop invariant indices except for one
6246   // which should be an induction variable.
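  // For example (illustrative IR), a GEP such as
  //   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
  // with a loop-invariant %base and an induction variable %iv matches, whereas
  // a GEP indexed by a loop-varying, non-induction value does not.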
6247   auto SE = PSE.getSE();
6248   unsigned NumOperands = Gep->getNumOperands();
6249   for (unsigned i = 1; i < NumOperands; ++i) {
6250     Value *Opd = Gep->getOperand(i);
6251     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6252         !Legal->isInductionVariable(Opd))
6253       return nullptr;
6254   }
6255 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6257   return PSE.getSCEV(Ptr);
6258 }
6259 
6260 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6261   return Legal->hasStride(I->getOperand(0)) ||
6262          Legal->hasStride(I->getOperand(1));
6263 }
6264 
6265 InstructionCost
6266 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6267                                                         ElementCount VF) {
6268   assert(VF.isVector() &&
6269          "Scalarization cost of instruction implies vectorization.");
6270   if (VF.isScalable())
6271     return InstructionCost::getInvalid();
6272 
6273   Type *ValTy = getLoadStoreType(I);
6274   auto SE = PSE.getSE();
6275 
6276   unsigned AS = getLoadStoreAddressSpace(I);
6277   Value *Ptr = getLoadStorePointerOperand(I);
6278   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6279   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6280   //       that it is being called from this specific place.
6281 
6282   // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6284   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6285 
6286   // Get the cost of the scalar memory instruction and address computation.
6287   InstructionCost Cost =
6288       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6289 
6290   // Don't pass *I here, since it is scalar but will actually be part of a
6291   // vectorized loop where the user of it is a vectorized instruction.
6292   const Align Alignment = getLoadStoreAlignment(I);
6293   Cost += VF.getKnownMinValue() *
6294           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6295                               AS, TTI::TCK_RecipThroughput);
6296 
6297   // Get the overhead of the extractelement and insertelement instructions
6298   // we might create due to scalarization.
6299   Cost += getScalarizationOverhead(I, VF);
6300 
6301   // If we have a predicated load/store, it will need extra i1 extracts and
6302   // conditional branches, but may not be executed for each vector lane. Scale
6303   // the cost by the probability of executing the predicated block.
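  // For example (illustrative): with getReciprocalPredBlockProb() == 2, the
  // per-lane address and memory costs accumulated above are halved before the
  // i1 extract and branch costs below are added unscaled.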
6304   if (isPredicatedInst(I, VF)) {
6305     Cost /= getReciprocalPredBlockProb();
6306 
6307     // Add the cost of an i1 extract and a branch
6308     auto *Vec_i1Ty =
6309         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6310     Cost += TTI.getScalarizationOverhead(
6311         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6312         /*Insert=*/false, /*Extract=*/true);
6313     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6314 
6315     if (useEmulatedMaskMemRefHack(I, VF))
6316       // Artificially setting to a high enough value to practically disable
6317       // vectorization with such operations.
6318       Cost = 3000000;
6319   }
6320 
6321   return Cost;
6322 }
6323 
6324 InstructionCost
6325 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6326                                                     ElementCount VF) {
6327   Type *ValTy = getLoadStoreType(I);
6328   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6329   Value *Ptr = getLoadStorePointerOperand(I);
6330   unsigned AS = getLoadStoreAddressSpace(I);
6331   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6332   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6333 
6334   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6335          "Stride should be 1 or -1 for consecutive memory access");
6336   const Align Alignment = getLoadStoreAlignment(I);
6337   InstructionCost Cost = 0;
6338   if (Legal->isMaskRequired(I))
6339     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6340                                       CostKind);
6341   else
6342     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6343                                 CostKind, I);
6344 
6345   bool Reverse = ConsecutiveStride < 0;
6346   if (Reverse)
6347     Cost +=
6348         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6349   return Cost;
6350 }
6351 
6352 InstructionCost
6353 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6354                                                 ElementCount VF) {
6355   assert(Legal->isUniformMemOp(*I));
6356 
6357   Type *ValTy = getLoadStoreType(I);
6358   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6359   const Align Alignment = getLoadStoreAlignment(I);
6360   unsigned AS = getLoadStoreAddressSpace(I);
6361   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
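  // For example (illustrative): a uniform load at VF = 4 is costed as one
  // address computation, one scalar load and one broadcast shuffle, rather
  // than four per-lane loads.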
6362   if (isa<LoadInst>(I)) {
6363     return TTI.getAddressComputationCost(ValTy) +
6364            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6365                                CostKind) +
6366            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6367   }
6368   StoreInst *SI = cast<StoreInst>(I);
6369 
6370   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6371   return TTI.getAddressComputationCost(ValTy) +
6372          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6373                              CostKind) +
6374          (isLoopInvariantStoreValue
6375               ? 0
6376               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6377                                        VF.getKnownMinValue() - 1));
6378 }
6379 
6380 InstructionCost
6381 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6382                                                  ElementCount VF) {
6383   Type *ValTy = getLoadStoreType(I);
6384   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6385   const Align Alignment = getLoadStoreAlignment(I);
6386   const Value *Ptr = getLoadStorePointerOperand(I);
6387 
6388   return TTI.getAddressComputationCost(VectorTy) +
6389          TTI.getGatherScatterOpCost(
6390              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6391              TargetTransformInfo::TCK_RecipThroughput, I);
6392 }
6393 
6394 InstructionCost
6395 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6396                                                    ElementCount VF) {
6397   // TODO: Once we have support for interleaving with scalable vectors
6398   // we can calculate the cost properly here.
6399   if (VF.isScalable())
6400     return InstructionCost::getInvalid();
6401 
6402   Type *ValTy = getLoadStoreType(I);
6403   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6404   unsigned AS = getLoadStoreAddressSpace(I);
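  // For example (illustrative): two loads of A[2*i] and A[2*i+1] form a
  // factor-2 interleave group; at VF = 4 the group is costed below as a single
  // wide access of VF * factor = 8 elements through the interleaved memory op
  // hook, rather than as two independent strided accesses.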
6405 
6406   auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Failed to get an interleaved access group.");
6408 
6409   unsigned InterleaveFactor = Group->getFactor();
6410   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6411 
6412   // Holds the indices of existing members in the interleaved group.
6413   SmallVector<unsigned, 4> Indices;
6414   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6415     if (Group->getMember(IF))
6416       Indices.push_back(IF);
6417 
6418   // Calculate the cost of the whole interleaved group.
6419   bool UseMaskForGaps =
6420       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6421       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6422   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6423       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6424       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6425 
6426   if (Group->isReverse()) {
6427     // TODO: Add support for reversed masked interleaved access.
6428     assert(!Legal->isMaskRequired(I) &&
6429            "Reverse masked interleaved access not supported.");
6430     Cost +=
6431         Group->getNumMembers() *
6432         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6433   }
6434   return Cost;
6435 }
6436 
6437 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6438     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6439   using namespace llvm::PatternMatch;
6440   // Early exit for no inloop reductions
6441   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6442     return None;
6443   auto *VectorTy = cast<VectorType>(Ty);
6444 
  // We are looking for one of the following patterns, and finding the minimal
  // acceptable cost among them:
6446   //  reduce(mul(ext(A), ext(B))) or
6447   //  reduce(mul(A, B)) or
6448   //  reduce(ext(A)) or
6449   //  reduce(A).
6450   // The basic idea is that we walk down the tree to do that, finding the root
6451   // reduction instruction in InLoopReductionImmediateChains. From there we find
6452   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, then we return it for
  // the reduction instruction and 0 for the other instructions in the pattern.
  // If it is not, we return an invalid cost specifying that the original cost
  // method should be used.
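  // For example (illustrative IR), starting the walk at either extend in
  //   %ext0 = sext i8 %a to i32
  //   %ext1 = sext i8 %b to i32
  //   %mul  = mul i32 %ext0, %ext1
  //   %red  = add i32 %sum, %mul
  // reaches the add as the root and matches the reduce(mul(ext(A), ext(B)))
  // pattern, provided the add is a recognised in-loop reduction.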
6457   Instruction *RetI = I;
6458   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6459     if (!RetI->hasOneUser())
6460       return None;
6461     RetI = RetI->user_back();
6462   }
6463   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6464       RetI->user_back()->getOpcode() == Instruction::Add) {
6465     if (!RetI->hasOneUser())
6466       return None;
6467     RetI = RetI->user_back();
6468   }
6469 
6470   // Test if the found instruction is a reduction, and if not return an invalid
6471   // cost specifying the parent to use the original cost modelling.
6472   if (!InLoopReductionImmediateChains.count(RetI))
6473     return None;
6474 
6475   // Find the reduction this chain is a part of and calculate the basic cost of
6476   // the reduction on its own.
6477   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6478   Instruction *ReductionPhi = LastChain;
6479   while (!isa<PHINode>(ReductionPhi))
6480     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6481 
6482   const RecurrenceDescriptor &RdxDesc =
6483       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6484 
6485   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6486       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6487 
6488   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6489   // normal fmul instruction to the cost of the fadd reduction.
6490   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6491     BaseCost +=
6492         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6493 
6494   // If we're using ordered reductions then we can just return the base cost
6495   // here, since getArithmeticReductionCost calculates the full ordered
6496   // reduction cost when FP reassociation is not allowed.
6497   if (useOrderedReductions(RdxDesc))
6498     return BaseCost;
6499 
6500   // Get the operand that was not the reduction chain and match it to one of the
6501   // patterns, returning the better cost if it is found.
6502   Instruction *RedOp = RetI->getOperand(1) == LastChain
6503                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6504                            : dyn_cast<Instruction>(RetI->getOperand(1));
6505 
6506   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6507 
6508   Instruction *Op0, *Op1;
6509   if (RedOp &&
6510       match(RedOp,
6511             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6512       match(Op0, m_ZExtOrSExt(m_Value())) &&
6513       Op0->getOpcode() == Op1->getOpcode() &&
6514       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6515       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6516       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6517 
    // Matched reduce(ext(mul(ext(A), ext(B))))
6519     // Note that the extend opcodes need to all match, or if A==B they will have
6520     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6521     // which is equally fine.
6522     bool IsUnsigned = isa<ZExtInst>(Op0);
6523     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6524     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6525 
6526     InstructionCost ExtCost =
6527         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6528                              TTI::CastContextHint::None, CostKind, Op0);
6529     InstructionCost MulCost =
6530         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6531     InstructionCost Ext2Cost =
6532         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6533                              TTI::CastContextHint::None, CostKind, RedOp);
6534 
6535     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6536         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6537         CostKind);
6538 
6539     if (RedCost.isValid() &&
6540         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6541       return I == RetI ? RedCost : 0;
6542   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6543              !TheLoop->isLoopInvariant(RedOp)) {
6544     // Matched reduce(ext(A))
6545     bool IsUnsigned = isa<ZExtInst>(RedOp);
6546     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6547     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6548         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6549         CostKind);
6550 
6551     InstructionCost ExtCost =
6552         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6553                              TTI::CastContextHint::None, CostKind, RedOp);
6554     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6555       return I == RetI ? RedCost : 0;
6556   } else if (RedOp &&
6557              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6558     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6559         Op0->getOpcode() == Op1->getOpcode() &&
6560         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6561       bool IsUnsigned = isa<ZExtInst>(Op0);
6562       Type *Op0Ty = Op0->getOperand(0)->getType();
6563       Type *Op1Ty = Op1->getOperand(0)->getType();
6564       Type *LargestOpTy =
6565           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6566                                                                     : Op0Ty;
6567       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6568 
6569       // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
6570       // different sizes. We take the largest type as the ext to reduce, and add
6571       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6572       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6573           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6574           TTI::CastContextHint::None, CostKind, Op0);
6575       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6576           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6577           TTI::CastContextHint::None, CostKind, Op1);
6578       InstructionCost MulCost =
6579           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6580 
6581       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6582           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6583           CostKind);
6584       InstructionCost ExtraExtCost = 0;
6585       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6586         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6587         ExtraExtCost = TTI.getCastInstrCost(
6588             ExtraExtOp->getOpcode(), ExtType,
6589             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6590             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6591       }
6592 
6593       if (RedCost.isValid() &&
6594           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6595         return I == RetI ? RedCost : 0;
6596     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6597       // Matched reduce(mul())
6598       InstructionCost MulCost =
6599           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6600 
      InstructionCost RedCost = TTI.getExtendedAddReductionCost(
          /*IsMLA=*/true, /*IsUnsigned=*/true, RdxDesc.getRecurrenceType(),
          VectorTy, CostKind);
6604 
6605       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6606         return I == RetI ? RedCost : 0;
6607     }
6608   }
6609 
6610   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
6611 }
6612 
6613 InstructionCost
6614 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6615                                                      ElementCount VF) {
6616   // Calculate scalar cost only. Vectorization cost should be ready at this
6617   // moment.
6618   if (VF.isScalar()) {
6619     Type *ValTy = getLoadStoreType(I);
6620     const Align Alignment = getLoadStoreAlignment(I);
6621     unsigned AS = getLoadStoreAddressSpace(I);
6622 
6623     return TTI.getAddressComputationCost(ValTy) +
6624            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6625                                TTI::TCK_RecipThroughput, I);
6626   }
6627   return getWideningCost(I, VF);
6628 }
6629 
6630 LoopVectorizationCostModel::VectorizationCostTy
6631 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6632                                                ElementCount VF) {
6633   // If we know that this instruction will remain uniform, check the cost of
6634   // the scalar version.
6635   if (isUniformAfterVectorization(I, VF))
6636     VF = ElementCount::getFixed(1);
6637 
6638   if (VF.isVector() && isProfitableToScalarize(I, VF))
6639     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6640 
6641   // Forced scalars do not have any scalarization overhead.
6642   auto ForcedScalar = ForcedScalars.find(VF);
6643   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6644     auto InstSet = ForcedScalar->second;
6645     if (InstSet.count(I))
6646       return VectorizationCostTy(
6647           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6648            VF.getKnownMinValue()),
6649           false);
6650   }
6651 
6652   Type *VectorTy;
6653   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6654 
6655   bool TypeNotScalarized = false;
6656   if (VF.isVector() && VectorTy->isVectorTy()) {
6657     if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6658       if (VF.isScalable())
6659         // <vscale x 1 x iN> is assumed to be profitable over iN because
6660         // scalable registers are a distinct register class from scalar ones.
6661         // If we ever find a target which wants to lower scalable vectors
6662         // back to scalars, we'll need to update this code to explicitly
6663         // ask TTI about the register class uses for each part.
6664         TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6665       else
6666         TypeNotScalarized = NumParts < VF.getKnownMinValue();
6667     } else
6668       C = InstructionCost::getInvalid();
6669   }
6670   return VectorizationCostTy(C, TypeNotScalarized);
6671 }
6672 
6673 InstructionCost
6674 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6675                                                      ElementCount VF) const {
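  // The overhead modelled here covers the insertelements needed to reassemble
  // the instruction's scalarized results into a vector, plus the
  // extractelements needed to feed each scalar copy from already-vectorized
  // operands.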
6676 
6677   // There is no mechanism yet to create a scalable scalarization loop,
6678   // so this is currently Invalid.
6679   if (VF.isScalable())
6680     return InstructionCost::getInvalid();
6681 
6682   if (VF.isScalar())
6683     return 0;
6684 
6685   InstructionCost Cost = 0;
6686   Type *RetTy = ToVectorTy(I->getType(), VF);
6687   if (!RetTy->isVoidTy() &&
6688       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6689     Cost += TTI.getScalarizationOverhead(
6690         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
6691         false);
6692 
6693   // Some targets keep addresses scalar.
6694   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6695     return Cost;
6696 
6697   // Some targets support efficient element stores.
6698   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6699     return Cost;
6700 
6701   // Collect operands to consider.
6702   CallInst *CI = dyn_cast<CallInst>(I);
6703   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6704 
6705   // Skip operands that do not require extraction/scalarization and do not incur
6706   // any overhead.
6707   SmallVector<Type *> Tys;
6708   for (auto *V : filterExtractingOperands(Ops, VF))
6709     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6710   return Cost + TTI.getOperandsScalarizationOverhead(
6711                     filterExtractingOperands(Ops, VF), Tys);
6712 }
6713 
6714 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6715   if (VF.isScalar())
6716     return;
6717   NumPredStores = 0;
6718   for (BasicBlock *BB : TheLoop->blocks()) {
6719     // For each instruction in the old loop.
6720     for (Instruction &I : *BB) {
6721       Value *Ptr =  getLoadStorePointerOperand(&I);
6722       if (!Ptr)
6723         continue;
6724 
6725       // TODO: We should generate better code and update the cost model for
6726       // predicated uniform stores. Today they are treated as any other
6727       // predicated store (see added test cases in
6728       // invariant-store-vectorization.ll).
6729       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6730         NumPredStores++;
6731 
6732       if (Legal->isUniformMemOp(I)) {
6733         // TODO: Avoid replicating loads and stores instead of
6734         // relying on instcombine to remove them.
6735         // Load: Scalar load + broadcast
6736         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6737         InstructionCost Cost;
6738         if (isa<StoreInst>(&I) && VF.isScalable() &&
6739             isLegalGatherOrScatter(&I, VF)) {
6740           Cost = getGatherScatterCost(&I, VF);
6741           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
6742         } else {
6743           Cost = getUniformMemOpCost(&I, VF);
6744           setWideningDecision(&I, VF, CM_Scalarize, Cost);
6745         }
6746         continue;
6747       }
6748 
6749       // We assume that widening is the best solution when possible.
6750       if (memoryInstructionCanBeWidened(&I, VF)) {
6751         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6752         int ConsecutiveStride = Legal->isConsecutivePtr(
6753             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6754         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6755                "Expected consecutive stride.");
6756         InstWidening Decision =
6757             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6758         setWideningDecision(&I, VF, Decision, Cost);
6759         continue;
6760       }
6761 
6762       // Choose between Interleaving, Gather/Scatter or Scalarization.
6763       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6764       unsigned NumAccesses = 1;
6765       if (isAccessInterleaved(&I)) {
6766         auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Failed to get an interleaved access group.");
6768 
6769         // Make one decision for the whole group.
6770         if (getWideningDecision(&I, VF) != CM_Unknown)
6771           continue;
6772 
6773         NumAccesses = Group->getNumMembers();
6774         if (interleavedAccessCanBeWidened(&I, VF))
6775           InterleaveCost = getInterleaveGroupCost(&I, VF);
6776       }
6777 
6778       InstructionCost GatherScatterCost =
6779           isLegalGatherOrScatter(&I, VF)
6780               ? getGatherScatterCost(&I, VF) * NumAccesses
6781               : InstructionCost::getInvalid();
6782 
6783       InstructionCost ScalarizationCost =
6784           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6785 
6786       // Choose better solution for the current VF,
6787       // write down this decision and use it during vectorization.
6788       InstructionCost Cost;
6789       InstWidening Decision;
6790       if (InterleaveCost <= GatherScatterCost &&
6791           InterleaveCost < ScalarizationCost) {
6792         Decision = CM_Interleave;
6793         Cost = InterleaveCost;
6794       } else if (GatherScatterCost < ScalarizationCost) {
6795         Decision = CM_GatherScatter;
6796         Cost = GatherScatterCost;
6797       } else {
6798         Decision = CM_Scalarize;
6799         Cost = ScalarizationCost;
6800       }
      // If the instruction belongs to an interleave group, the whole group
6802       // receives the same decision. The whole group receives the cost, but
6803       // the cost will actually be assigned to one instruction.
6804       if (auto Group = getInterleavedAccessGroup(&I))
6805         setWideningDecision(Group, VF, Decision, Cost);
6806       else
6807         setWideningDecision(&I, VF, Decision, Cost);
6808     }
6809   }
6810 
6811   // Make sure that any load of address and any other address computation
6812   // remains scalar unless there is gather/scatter support. This avoids
6813   // inevitable extracts into address registers, and also has the benefit of
6814   // activating LSR more, since that pass can't optimize vectorized
6815   // addresses.
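  // For example (illustrative): on such a target, a load whose only use is
  // computing another memory access's address is switched back to a
  // scalarized form below, so the address stays in scalar registers.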
6816   if (TTI.prefersVectorizedAddressing())
6817     return;
6818 
6819   // Start with all scalar pointer uses.
6820   SmallPtrSet<Instruction *, 8> AddrDefs;
6821   for (BasicBlock *BB : TheLoop->blocks())
6822     for (Instruction &I : *BB) {
6823       Instruction *PtrDef =
6824         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6825       if (PtrDef && TheLoop->contains(PtrDef) &&
6826           getWideningDecision(&I, VF) != CM_GatherScatter)
6827         AddrDefs.insert(PtrDef);
6828     }
6829 
6830   // Add all instructions used to generate the addresses.
6831   SmallVector<Instruction *, 4> Worklist;
6832   append_range(Worklist, AddrDefs);
6833   while (!Worklist.empty()) {
6834     Instruction *I = Worklist.pop_back_val();
6835     for (auto &Op : I->operands())
6836       if (auto *InstOp = dyn_cast<Instruction>(Op))
6837         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6838             AddrDefs.insert(InstOp).second)
6839           Worklist.push_back(InstOp);
6840   }
6841 
6842   for (auto *I : AddrDefs) {
6843     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
6846       // if the loaded register is involved in an address computation, it is
6847       // instead changed here when we know this is the case.
6848       InstWidening Decision = getWideningDecision(I, VF);
6849       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6850         // Scalarize a widened load of address.
6851         setWideningDecision(
6852             I, VF, CM_Scalarize,
6853             (VF.getKnownMinValue() *
6854              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6855       else if (auto Group = getInterleavedAccessGroup(I)) {
6856         // Scalarize an interleave group of address loads.
6857         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6858           if (Instruction *Member = Group->getMember(I))
6859             setWideningDecision(
6860                 Member, VF, CM_Scalarize,
6861                 (VF.getKnownMinValue() *
6862                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6863         }
6864       }
6865     } else
6866       // Make sure I gets scalarized and a cost estimate without
6867       // scalarization overhead.
6868       ForcedScalars[VF].insert(I);
6869   }
6870 }
6871 
6872 InstructionCost
6873 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6874                                                Type *&VectorTy) {
6875   Type *RetTy = I->getType();
6876   if (canTruncateToMinimalBitwidth(I, VF))
6877     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6878   auto SE = PSE.getSE();
6879   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6880 
6881   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6882                                                 ElementCount VF) -> bool {
6883     if (VF.isScalar())
6884       return true;
6885 
6886     auto Scalarized = InstsToScalarize.find(VF);
6887     assert(Scalarized != InstsToScalarize.end() &&
6888            "VF not yet analyzed for scalarization profitability");
6889     return !Scalarized->second.count(I) &&
6890            llvm::all_of(I->users(), [&](User *U) {
6891              auto *UI = cast<Instruction>(U);
6892              return !Scalarized->second.count(UI);
6893            });
6894   };
6895   (void) hasSingleCopyAfterVectorization;
6896 
6897   if (isScalarAfterVectorization(I, VF)) {
6898     // With the exception of GEPs and PHIs, after scalarization there should
6899     // only be one copy of the instruction generated in the loop. This is
6900     // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // we don't have to multiply the instruction cost by VF.
6903     assert(I->getOpcode() == Instruction::GetElementPtr ||
6904            I->getOpcode() == Instruction::PHI ||
6905            (I->getOpcode() == Instruction::BitCast &&
6906             I->getType()->isPointerTy()) ||
6907            hasSingleCopyAfterVectorization(I, VF));
6908     VectorTy = RetTy;
6909   } else
6910     VectorTy = ToVectorTy(RetTy, VF);
6911 
6912   // TODO: We need to estimate the cost of intrinsic calls.
6913   switch (I->getOpcode()) {
6914   case Instruction::GetElementPtr:
6915     // We mark this instruction as zero-cost because the cost of GEPs in
6916     // vectorized code depends on whether the corresponding memory instruction
6917     // is scalarized or not. Therefore, we handle GEPs with the memory
6918     // instruction cost.
6919     return 0;
6920   case Instruction::Br: {
6921     // In cases of scalarized and predicated instructions, there will be VF
6922     // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
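    // For example (illustrative): at VF = 4 with a scalarized predicated
    // successor, the cost below is four scalar branch costs plus the overhead
    // of extracting four i1 elements from the vector compare.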
6924     bool ScalarPredicatedBB = false;
6925     BranchInst *BI = cast<BranchInst>(I);
6926     if (VF.isVector() && BI->isConditional() &&
6927         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
6928          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
6929       ScalarPredicatedBB = true;
6930 
6931     if (ScalarPredicatedBB) {
      // Not possible to scalarize a scalable vector with predicated
      // instructions.
6933       if (VF.isScalable())
6934         return InstructionCost::getInvalid();
6935       // Return cost for branches around scalarized and predicated blocks.
6936       auto *Vec_i1Ty =
6937           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6938       return (
6939           TTI.getScalarizationOverhead(
6940               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
6941           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6942     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6943       // The back-edge branch will remain, as will all scalar branches.
6944       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6945     else
6946       // This branch will be eliminated by if-conversion.
6947       return 0;
6948     // Note: We currently assume zero cost for an unconditional branch inside
6949     // a predicated block since it will become a fall-through, although we
6950     // may decide in the future to call TTI for all branches.
6951   }
6952   case Instruction::PHI: {
6953     auto *Phi = cast<PHINode>(I);
6954 
6955     // First-order recurrences are replaced by vector shuffles inside the loop.
6956     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6957     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
6958       return TTI.getShuffleCost(
6959           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
6960           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
6961 
6962     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6963     // converted into select instructions. We require N - 1 selects per phi
6964     // node, where N is the number of incoming values.
6965     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
6966       return (Phi->getNumIncomingValues() - 1) *
6967              TTI.getCmpSelInstrCost(
6968                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6969                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
6970                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
6971 
6972     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
6973   }
6974   case Instruction::UDiv:
6975   case Instruction::SDiv:
6976   case Instruction::URem:
6977   case Instruction::SRem:
6978     // If we have a predicated instruction, it may not be executed for each
6979     // vector lane. Get the scalarization cost and scale this amount by the
6980     // probability of executing the predicated block. If the instruction is not
6981     // predicated, we fall through to the next case.
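    // For example (illustrative): a predicated udiv at VF = 4 is costed below
    // as roughly 4 PHIs + 4 scalar udivs + the insert/extract overhead, all
    // divided by getReciprocalPredBlockProb().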
6982     if (VF.isVector() && isScalarWithPredication(I, VF)) {
6983       InstructionCost Cost = 0;
6984 
6985       // These instructions have a non-void type, so account for the phi nodes
6986       // that we will create. This cost is likely to be zero. The phi node
6987       // cost, if any, should be scaled by the block probability because it
6988       // models a copy at the end of each predicated block.
6989       Cost += VF.getKnownMinValue() *
6990               TTI.getCFInstrCost(Instruction::PHI, CostKind);
6991 
6992       // The cost of the non-predicated instruction.
6993       Cost += VF.getKnownMinValue() *
6994               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
6995 
6996       // The cost of insertelement and extractelement instructions needed for
6997       // scalarization.
6998       Cost += getScalarizationOverhead(I, VF);
6999 
7000       // Scale the cost by the probability of executing the predicated blocks.
7001       // This assumes the predicated block for each vector lane is equally
7002       // likely.
7003       return Cost / getReciprocalPredBlockProb();
7004     }
7005     LLVM_FALLTHROUGH;
7006   case Instruction::Add:
7007   case Instruction::FAdd:
7008   case Instruction::Sub:
7009   case Instruction::FSub:
7010   case Instruction::Mul:
7011   case Instruction::FMul:
7012   case Instruction::FDiv:
7013   case Instruction::FRem:
7014   case Instruction::Shl:
7015   case Instruction::LShr:
7016   case Instruction::AShr:
7017   case Instruction::And:
7018   case Instruction::Or:
7019   case Instruction::Xor: {
7020     // Since we will replace the stride by 1 the multiplication should go away.
7021     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7022       return 0;
7023 
7024     // Detect reduction patterns
7025     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7026       return *RedCost;
7027 
7028     // Certain instructions can be cheaper to vectorize if they have a constant
7029     // second vector operand. One example of this are shifts on x86.
7030     Value *Op2 = I->getOperand(1);
7031     TargetTransformInfo::OperandValueProperties Op2VP;
7032     TargetTransformInfo::OperandValueKind Op2VK =
7033         TTI.getOperandInfo(Op2, Op2VP);
7034     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7035       Op2VK = TargetTransformInfo::OK_UniformValue;
7036 
7037     SmallVector<const Value *, 4> Operands(I->operand_values());
7038     return TTI.getArithmeticInstrCost(
7039         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7040         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7041   }
7042   case Instruction::FNeg: {
7043     return TTI.getArithmeticInstrCost(
7044         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7045         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7046         TargetTransformInfo::OP_None, I->getOperand(0), I);
7047   }
7048   case Instruction::Select: {
7049     SelectInst *SI = cast<SelectInst>(I);
7050     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7051     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7052 
7053     const Value *Op0, *Op1;
7054     using namespace llvm::PatternMatch;
7055     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7056                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7057       // select x, y, false --> x & y
7058       // select x, true, y --> x | y
7059       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7060       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7061       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7062       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7063       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7064               Op1->getType()->getScalarSizeInBits() == 1);
7065 
7066       SmallVector<const Value *, 2> Operands{Op0, Op1};
7067       return TTI.getArithmeticInstrCost(
7068           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7069           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7070     }
7071 
7072     Type *CondTy = SI->getCondition()->getType();
7073     if (!ScalarCond)
7074       CondTy = VectorType::get(CondTy, VF);
7075 
7076     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7077     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7078       Pred = Cmp->getPredicate();
7079     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7080                                   CostKind, I);
7081   }
7082   case Instruction::ICmp:
7083   case Instruction::FCmp: {
7084     Type *ValTy = I->getOperand(0)->getType();
7085     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7086     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7087       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7088     VectorTy = ToVectorTy(ValTy, VF);
7089     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7090                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7091                                   I);
7092   }
7093   case Instruction::Store:
7094   case Instruction::Load: {
7095     ElementCount Width = VF;
7096     if (Width.isVector()) {
7097       InstWidening Decision = getWideningDecision(I, Width);
7098       assert(Decision != CM_Unknown &&
7099              "CM decision should be taken at this point");
7100       if (Decision == CM_Scalarize) {
7101         if (VF.isScalable() && isa<StoreInst>(I))
          // We can't scalarize a scalable vector store (even a uniform one,
          // currently); return an invalid cost to prevent vectorization.
7104           return InstructionCost::getInvalid();
7105         Width = ElementCount::getFixed(1);
7106       }
7107     }
7108     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7109     return getMemoryInstructionCost(I, VF);
7110   }
7111   case Instruction::BitCast:
7112     if (I->getType()->isPointerTy())
7113       return 0;
7114     LLVM_FALLTHROUGH;
7115   case Instruction::ZExt:
7116   case Instruction::SExt:
7117   case Instruction::FPToUI:
7118   case Instruction::FPToSI:
7119   case Instruction::FPExt:
7120   case Instruction::PtrToInt:
7121   case Instruction::IntToPtr:
7122   case Instruction::SIToFP:
7123   case Instruction::UIToFP:
7124   case Instruction::Trunc:
7125   case Instruction::FPTrunc: {
7126     // Computes the CastContextHint from a Load/Store instruction.
7127     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7128       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7129              "Expected a load or a store!");
7130 
7131       if (VF.isScalar() || !TheLoop->contains(I))
7132         return TTI::CastContextHint::Normal;
7133 
7134       switch (getWideningDecision(I, VF)) {
7135       case LoopVectorizationCostModel::CM_GatherScatter:
7136         return TTI::CastContextHint::GatherScatter;
7137       case LoopVectorizationCostModel::CM_Interleave:
7138         return TTI::CastContextHint::Interleave;
7139       case LoopVectorizationCostModel::CM_Scalarize:
7140       case LoopVectorizationCostModel::CM_Widen:
7141         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7142                                         : TTI::CastContextHint::Normal;
7143       case LoopVectorizationCostModel::CM_Widen_Reverse:
7144         return TTI::CastContextHint::Reversed;
7145       case LoopVectorizationCostModel::CM_Unknown:
7146         llvm_unreachable("Instr did not go through cost modelling?");
7147       }
7148 
7149       llvm_unreachable("Unhandled case!");
7150     };
7151 
7152     unsigned Opcode = I->getOpcode();
7153     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7154     // For Trunc, the context is the only user, which must be a StoreInst.
7155     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7156       if (I->hasOneUse())
7157         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7158           CCH = ComputeCCH(Store);
7159     }
7160     // For Z/Sext, the context is the operand, which must be a LoadInst.
7161     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7162              Opcode == Instruction::FPExt) {
7163       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7164         CCH = ComputeCCH(Load);
7165     }
7166 
7167     // We optimize the truncation of induction variables having constant
7168     // integer steps. The cost of these truncations is the same as the scalar
7169     // operation.
7170     if (isOptimizableIVTruncate(I, VF)) {
7171       auto *Trunc = cast<TruncInst>(I);
7172       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7173                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7174     }
7175 
7176     // Detect reduction patterns
7177     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7178       return *RedCost;
7179 
7180     Type *SrcScalarTy = I->getOperand(0)->getType();
7181     Type *SrcVecTy =
7182         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7183     if (canTruncateToMinimalBitwidth(I, VF)) {
7184       // This cast is going to be shrunk. This may remove the cast or it might
7185       // turn it into slightly different cast. For example, if MinBW == 16,
7186       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7187       //
7188       // Calculate the modified src and dest types.
7189       Type *MinVecTy = VectorTy;
7190       if (Opcode == Instruction::Trunc) {
7191         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7192         VectorTy =
7193             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7194       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7195         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7196         VectorTy =
7197             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7198       }
7199     }
7200 
7201     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7202   }
7203   case Instruction::Call: {
7204     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7205       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7206         return *RedCost;
7207     bool NeedToScalarize;
7208     CallInst *CI = cast<CallInst>(I);
7209     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7210     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7211       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7212       return std::min(CallCost, IntrinsicCost);
7213     }
7214     return CallCost;
7215   }
7216   case Instruction::ExtractValue:
7217     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7218   case Instruction::Alloca:
7219     // We cannot easily widen alloca to a scalable alloca, as
7220     // the result would need to be a vector of pointers.
7221     if (VF.isScalable())
7222       return InstructionCost::getInvalid();
7223     LLVM_FALLTHROUGH;
7224   default:
7225     // This opcode is unknown. Assume that it is the same as 'mul'.
7226     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7227   } // end of switch.
7228 }
7229 
7230 char LoopVectorize::ID = 0;
7231 
7232 static const char lv_name[] = "Loop Vectorization";
7233 
7234 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7235 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7236 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7237 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7238 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7239 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7240 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7241 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7242 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7243 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7244 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7245 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7246 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7247 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7248 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7249 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7250 
7251 namespace llvm {
7252 
7253 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7254 
7255 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7256                               bool VectorizeOnlyWhenForced) {
7257   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7258 }
7259 
7260 } // end namespace llvm
7261 
7262 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7263   // Check if the pointer operand of a load or store instruction is
7264   // consecutive.
7265   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7266     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7267   return false;
7268 }
7269 
7270 void LoopVectorizationCostModel::collectValuesToIgnore() {
7271   // Ignore ephemeral values.
7272   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7273 
7274   // Find all stores to invariant variables. Since they are going to sink
7275   // outside the loop, we do not need to calculate their cost.
7276   for (BasicBlock *BB : TheLoop->blocks())
7277     for (Instruction &I : *BB) {
7278       StoreInst *SI;
7279       if ((SI = dyn_cast<StoreInst>(&I)) &&
7280           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7281         ValuesToIgnore.insert(&I);
7282     }
7283 
7284   // Ignore type-promoting instructions we identified during reduction
7285   // detection.
7286   for (auto &Reduction : Legal->getReductionVars()) {
7287     const RecurrenceDescriptor &RedDes = Reduction.second;
7288     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7289     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7290   }
7291   // Ignore type-casting instructions we identified during induction
7292   // detection.
7293   for (auto &Induction : Legal->getInductionVars()) {
7294     const InductionDescriptor &IndDes = Induction.second;
7295     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7296     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7297   }
7298 }
7299 
7300 void LoopVectorizationCostModel::collectInLoopReductions() {
7301   for (auto &Reduction : Legal->getReductionVars()) {
7302     PHINode *Phi = Reduction.first;
7303     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7304 
7305     // We don't collect reductions that are type promoted (yet).
7306     if (RdxDesc.getRecurrenceType() != Phi->getType())
7307       continue;
7308 
7309     // If the target would prefer this reduction to happen "in-loop", then we
7310     // want to record it as such.
7311     unsigned Opcode = RdxDesc.getOpcode();
7312     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7313         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7314                                    TargetTransformInfo::ReductionFlags()))
7315       continue;
7316 
7317     // Check that we can correctly put the reductions into the loop, by
7318     // finding the chain of operations that leads from the phi to the loop
7319     // exit value.
7320     SmallVector<Instruction *, 4> ReductionOperations =
7321         RdxDesc.getReductionOpChain(Phi, TheLoop);
7322     bool InLoop = !ReductionOperations.empty();
7323     if (InLoop) {
7324       InLoopReductionChains[Phi] = ReductionOperations;
7325       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7326       Instruction *LastChain = Phi;
7327       for (auto *I : ReductionOperations) {
7328         InLoopReductionImmediateChains[I] = LastChain;
7329         LastChain = I;
7330       }
7331     }
7332     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7333                       << " reduction for phi: " << *Phi << "\n");
7334   }
7335 }
7336 
7337 // TODO: we could return a pair of values that specify the max VF and
7338 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7339 // `buildVPlans(VF, VF)`. We cannot do that yet because VPlan does not
7340 // currently have a cost model that can choose which plan to execute if
7341 // more than one is generated.
7342 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7343                                  LoopVectorizationCostModel &CM) {
7344   unsigned WidestType;
7345   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
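  // E.g. (illustrative numbers only): 256-bit vector registers and a widest
  // scalar type of 32 bits yield a VF of 256 / 32 = 8.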
7346   return WidestVectorRegBits / WidestType;
7347 }
7348 
7349 VectorizationFactor
7350 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7351   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7352   ElementCount VF = UserVF;
7353   // Outer loop handling: outer loops may require CFG and instruction-level
7354   // transformations before even evaluating whether vectorization is profitable.
7355   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7356   // the vectorization pipeline.
7357   if (!OrigLoop->isInnermost()) {
7358     // If the user doesn't provide a vectorization factor, determine a
7359     // reasonable one.
7360     if (UserVF.isZero()) {
7361       VF = ElementCount::getFixed(determineVPlanVF(
7362           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7363               .getFixedSize(),
7364           CM));
7365       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7366 
7367       // Make sure we have a VF > 1 for stress testing.
7368       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7369         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7370                           << "overriding computed VF.\n");
7371         VF = ElementCount::getFixed(4);
7372       }
7373     }
7374     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7375     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7376            "VF needs to be a power of two");
7377     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7378                       << "VF " << VF << " to build VPlans.\n");
7379     buildVPlans(VF, VF);
7380 
7381     // For VPlan build stress testing, we bail out after VPlan construction.
7382     if (VPlanBuildStressTest)
7383       return VectorizationFactor::Disabled();
7384 
7385     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7386   }
7387 
7388   LLVM_DEBUG(
7389       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7390                 "VPlan-native path.\n");
7391   return VectorizationFactor::Disabled();
7392 }
7393 
7394 bool LoopVectorizationPlanner::requiresTooManyRuntimeChecks() const {
7395   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
7396   return (NumRuntimePointerChecks >
7397               VectorizerParams::RuntimeMemoryCheckThreshold &&
7398           !Hints.allowReordering()) ||
7399          NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
7400 }
7401 
7402 Optional<VectorizationFactor>
7403 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7404   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7405   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7406   if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7407     return None;
7408 
7409   // Invalidate interleave groups if all blocks of loop will be predicated.
7410   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7411       !useMaskedInterleavedAccesses(*TTI)) {
7412     LLVM_DEBUG(
7413         dbgs()
7414         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7415            "which requires masked-interleaved support.\n");
7416     if (CM.InterleaveInfo.invalidateGroups())
7417       // Invalidating interleave groups also requires invalidating all decisions
7418       // based on them, which includes widening decisions and uniform and scalar
7419       // values.
7420       CM.invalidateCostModelingDecisions();
7421   }
7422 
7423   ElementCount MaxUserVF =
7424       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7425   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7426   if (!UserVF.isZero() && UserVFIsLegal) {
7427     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7428            "VF needs to be a power of two");
7429     // Collect the instructions (and their associated costs) that will be more
7430     // profitable to scalarize.
7431     if (CM.selectUserVectorizationFactor(UserVF)) {
7432       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7433       CM.collectInLoopReductions();
7434       buildVPlansWithVPRecipes(UserVF, UserVF);
7435       LLVM_DEBUG(printPlans(dbgs()));
7436       return {{UserVF, 0, 0}};
7437     } else
7438       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7439                               "InvalidCost", ORE, OrigLoop);
7440   }
7441 
7442   // Populate the set of Vectorization Factor Candidates.
7443   ElementCountSet VFCandidates;
7444   for (auto VF = ElementCount::getFixed(1);
7445        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7446     VFCandidates.insert(VF);
7447   for (auto VF = ElementCount::getScalable(1);
7448        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7449     VFCandidates.insert(VF);
7450 
7451   for (const auto &VF : VFCandidates) {
7452     // Collect Uniform and Scalar instructions after vectorization with VF.
7453     CM.collectUniformsAndScalars(VF);
7454 
7455     // Collect the instructions (and their associated costs) that will be more
7456     // profitable to scalarize.
7457     if (VF.isVector())
7458       CM.collectInstsToScalarize(VF);
7459   }
7460 
7461   CM.collectInLoopReductions();
7462   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7463   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7464 
7465   LLVM_DEBUG(printPlans(dbgs()));
7466   if (!MaxFactors.hasVector())
7467     return VectorizationFactor::Disabled();
7468 
7469   // Select the optimal vectorization factor.
7470   return CM.selectVectorizationFactor(VFCandidates);
7471 }
7472 
7473 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7474   assert(count_if(VPlans,
7475                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7476              1 &&
7477          "Best VF has not a single VPlan.");
7478 
7479   for (const VPlanPtr &Plan : VPlans) {
7480     if (Plan->hasVF(VF))
7481       return *Plan.get();
7482   }
7483   llvm_unreachable("No plan found!");
7484 }
7485 
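/// Attach "llvm.loop.unroll.runtime.disable" to \p L's loop ID, unless the
/// loop already carries "llvm.loop.unroll.disable" metadata. The resulting
/// (self-referential) loop ID looks roughly like the following sketch, with
/// any pre-existing operands elided:
///   !0 = distinct !{!0, ..., !{!"llvm.loop.unroll.runtime.disable"}}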
7486 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7487   SmallVector<Metadata *, 4> MDs;
7488   // Reserve first location for self reference to the LoopID metadata node.
7489   MDs.push_back(nullptr);
7490   bool IsUnrollMetadata = false;
7491   MDNode *LoopID = L->getLoopID();
7492   if (LoopID) {
7493     // First find existing loop unrolling disable metadata.
7494     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7495       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7496       if (MD) {
7497         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7498         IsUnrollMetadata =
7499             S && S->getString().startswith("llvm.loop.unroll.disable");
7500       }
7501       MDs.push_back(LoopID->getOperand(i));
7502     }
7503   }
7504 
7505   if (!IsUnrollMetadata) {
7506     // Add runtime unroll disable metadata.
7507     LLVMContext &Context = L->getHeader()->getContext();
7508     SmallVector<Metadata *, 1> DisableOperands;
7509     DisableOperands.push_back(
7510         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7511     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7512     MDs.push_back(DisableNode);
7513     MDNode *NewLoopID = MDNode::get(Context, MDs);
7514     // Set operand 0 to refer to the loop id itself.
7515     NewLoopID->replaceOperandWith(0, NewLoopID);
7516     L->setLoopID(NewLoopID);
7517   }
7518 }
7519 
7520 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7521                                            VPlan &BestVPlan,
7522                                            InnerLoopVectorizer &ILV,
7523                                            DominatorTree *DT) {
7524   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7525                     << '\n');
7526 
7527   // Perform the actual loop transformation.
7528 
7529   // 1. Set up the skeleton for vectorization, including vector pre-header and
7530   // middle block. The vector loop is created during VPlan execution.
7531   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7532   Value *CanonicalIVStartValue;
7533   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7534       ILV.createVectorizedLoopSkeleton();
7535 
7536   // Only use noalias metadata when using memory checks guaranteeing no overlap
7537   // across all iterations.
7538   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7539   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7540       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7541 
7542     //  We currently don't use LoopVersioning for the actual loop cloning but we
7543     //  still use it to add the noalias metadata.
7544     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7545     //        metadata.
7546     State.LVer = std::make_unique<LoopVersioning>(
7547         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7548         PSE.getSE());
7549     State.LVer->prepareNoAliasMetadata();
7550   }
7551 
7552   ILV.collectPoisonGeneratingRecipes(State);
7553 
7554   ILV.printDebugTracesAtStart();
7555 
7556   //===------------------------------------------------===//
7557   //
7558   // Notice: any optimization or new instruction that goes
7559   // into the code below should also be implemented in
7560   // the cost-model.
7561   //
7562   //===------------------------------------------------===//
7563 
7564   // 2. Copy and widen instructions from the old loop into the new loop.
7565   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7566                              ILV.getOrCreateVectorTripCount(nullptr),
7567                              CanonicalIVStartValue, State);
7568 
7569   BestVPlan.execute(&State);
7570 
7571   // Keep all loop hints from the original loop on the vector loop (we'll
7572   // replace the vectorizer-specific hints below).
7573   MDNode *OrigLoopID = OrigLoop->getLoopID();
7574 
7575   Optional<MDNode *> VectorizedLoopID =
7576       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7577                                       LLVMLoopVectorizeFollowupVectorized});
7578 
7579   VPBasicBlock *HeaderVPBB =
7580       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7581   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7582   if (VectorizedLoopID)
7583     L->setLoopID(VectorizedLoopID.getValue());
7584   else {
7585     // Keep all loop hints from the original loop on the vector loop (we'll
7586     // replace the vectorizer-specific hints below).
7587     if (MDNode *LID = OrigLoop->getLoopID())
7588       L->setLoopID(LID);
7589 
7590     LoopVectorizeHints Hints(L, true, *ORE);
7591     Hints.setAlreadyVectorized();
7592   }
7593   // Disable runtime unrolling when vectorizing the epilogue loop.
7594   if (CanonicalIVStartValue)
7595     AddRuntimeUnrollDisableMetaData(L);
7596 
7597   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7598   //    predication, updating analyses.
7599   ILV.fixVectorizedLoop(State, BestVPlan);
7600 
7601   ILV.printDebugTracesAtEnd();
7602 }
7603 
7604 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7605 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7606   for (const auto &Plan : VPlans)
7607     if (PrintVPlansInDotFormat)
7608       Plan->printDOT(O);
7609     else
7610       Plan->print(O);
7611 }
7612 #endif
7613 
7614 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7615     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7616 
7617   // We create new control-flow for the vectorized loop, so the original exit
7618   // conditions will be dead after vectorization if they are only used by the
7619   // terminator.
7620   SmallVector<BasicBlock*> ExitingBlocks;
7621   OrigLoop->getExitingBlocks(ExitingBlocks);
7622   for (auto *BB : ExitingBlocks) {
7623     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7624     if (!Cmp || !Cmp->hasOneUse())
7625       continue;
7626 
7627     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7628     if (!DeadInstructions.insert(Cmp).second)
7629       continue;
7630 
7631     // An operand of the icmp is often a dead trunc, used by IndUpdate.
7632     // TODO: can recurse through operands in general
7633     for (Value *Op : Cmp->operands()) {
7634       if (isa<TruncInst>(Op) && Op->hasOneUse())
7635           DeadInstructions.insert(cast<Instruction>(Op));
7636     }
7637   }
7638 
7639   // We create new "steps" for induction variable updates to which the original
7640   // induction variables map. An original update instruction will be dead if
7641   // all its users except the induction variable are dead.
7642   auto *Latch = OrigLoop->getLoopLatch();
7643   for (auto &Induction : Legal->getInductionVars()) {
7644     PHINode *Ind = Induction.first;
7645     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7646 
7647     // If the tail is to be folded by masking, the primary induction variable,
7648     // if it exists, isn't dead: it will be used for masking. Don't kill it.
7649     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7650       continue;
7651 
7652     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7653           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7654         }))
7655       DeadInstructions.insert(IndUpdate);
7656   }
7657 }
7658 
7659 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7660 
7661 //===--------------------------------------------------------------------===//
7662 // EpilogueVectorizerMainLoop
7663 //===--------------------------------------------------------------------===//
7664 
7665 /// This function is partially responsible for generating the control flow
7666 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7667 std::pair<BasicBlock *, Value *>
7668 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7669   MDNode *OrigLoopID = OrigLoop->getLoopID();
7670 
7671   // Workaround!  Compute the trip count of the original loop and cache it
7672   // before we start modifying the CFG.  This code has a systemic problem
7673   // wherein it tries to run analysis over partially constructed IR; this is
7674   // wrong, and not simply for SCEV.  The trip count of the original loop
7675   // simply happens to be prone to hitting this in practice.  In theory, we
7676   // can hit the same issue for any SCEV, or ValueTracking query done during
7677   // mutation.  See PR49900.
7678   getOrCreateTripCount(OrigLoop->getLoopPreheader());
7679   createVectorLoopSkeleton("");
7680 
7681   // Generate the code to check the minimum iteration count of the vector
7682   // epilogue (see below).
7683   EPI.EpilogueIterationCountCheck =
7684       emitIterationCountCheck(LoopScalarPreHeader, true);
7685   EPI.EpilogueIterationCountCheck->setName("iter.check");
7686 
7687   // Generate the code to check any assumptions that we've made for SCEV
7688   // expressions.
7689   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7690 
7691   // Generate the code that checks at runtime if arrays overlap. We put the
7692   // checks into a separate block to make the more common case of few elements
7693   // faster.
7694   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7695 
7696   // Generate the iteration count check for the main loop, *after* the check
7697   // for the epilogue loop, so that the path-length is shorter for the case
7698   // that goes directly through the vector epilogue. The longer-path length for
7699   // the main loop is compensated for, by the gain from vectorizing the larger
7700   // trip count. Note: the branch will get updated later on when we vectorize
7701   // the epilogue.
7702   EPI.MainLoopIterationCountCheck =
7703       emitIterationCountCheck(LoopScalarPreHeader, false);
7704 
7705   // Generate the induction variable.
7706   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7707 
7708   // Skip induction resume value creation here because they will be created in
7709   // the second pass. If we created them here, they wouldn't be used anyway,
7710   // because the VPlan in the second pass still contains the inductions from the
7711   // original loop.
7712 
7713   return {completeLoopSkeleton(OrigLoopID), nullptr};
7714 }
7715 
7716 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7717   LLVM_DEBUG({
7718     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7719            << "Main Loop VF:" << EPI.MainLoopVF
7720            << ", Main Loop UF:" << EPI.MainLoopUF
7721            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7722            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7723   });
7724 }
7725 
7726 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7727   DEBUG_WITH_TYPE(VerboseDebug, {
7728     dbgs() << "intermediate fn:\n"
7729            << *OrigLoop->getHeader()->getParent() << "\n";
7730   });
7731 }
7732 
7733 BasicBlock *
7734 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7735                                                     bool ForEpilogue) {
7736   assert(Bypass && "Expected valid bypass basic block.");
7737   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7738   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7739   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
7740   // Reuse existing vector loop preheader for TC checks.
7741   // Note that new preheader block is generated for vector loop.
7742   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7743   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7744 
7745   // Generate code to check if the loop's trip count is less than VF * UF of the
7746   // main vector loop.
7747   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
7748       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7749 
7750   Value *CheckMinIters = Builder.CreateICmp(
7751       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7752       "min.iters.check");
7753 
7754   if (!ForEpilogue)
7755     TCCheckBlock->setName("vector.main.loop.iter.check");
7756 
7757   // Create new preheader for vector loop.
7758   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7759                                    DT, LI, nullptr, "vector.ph");
7760 
7761   if (ForEpilogue) {
7762     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7763                                  DT->getNode(Bypass)->getIDom()) &&
7764            "TC check is expected to dominate Bypass");
7765 
7766     // Update dominator for Bypass & LoopExit.
7767     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7768     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7769       // For loops with multiple exits, there's no edge from the middle block
7770       // to exit blocks (as the epilogue must run) and thus no need to update
7771       // the immediate dominator of the exit blocks.
7772       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7773 
7774     LoopBypassBlocks.push_back(TCCheckBlock);
7775 
7776     // Save the trip count so we don't have to regenerate it in the
7777     // vec.epilog.iter.check. This is safe to do because the trip count
7778     // generated here dominates the vector epilog iter check.
7779     EPI.TripCount = Count;
7780   }
7781 
7782   ReplaceInstWithInst(
7783       TCCheckBlock->getTerminator(),
7784       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7785 
7786   return TCCheckBlock;
7787 }
7788 
7789 //===--------------------------------------------------------------------===//
7790 // EpilogueVectorizerEpilogueLoop
7791 //===--------------------------------------------------------------------===//
7792 
7793 /// This function is partially responsible for generating the control flow
7794 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7795 std::pair<BasicBlock *, Value *>
7796 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7797   MDNode *OrigLoopID = OrigLoop->getLoopID();
7798   createVectorLoopSkeleton("vec.epilog.");
7799 
7800   // Now, compare the remaining count; if there aren't enough iterations to
7801   // execute the vectorized epilogue, skip to the scalar part.
7802   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7803   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7804   LoopVectorPreHeader =
7805       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7806                  LI, nullptr, "vec.epilog.ph");
7807   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7808                                           VecEpilogueIterationCountCheck);
7809 
7810   // Adjust the control flow taking the state info from the main loop
7811   // vectorization into account.
7812   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7813          "expected this to be saved from the previous pass.");
7814   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7815       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7816 
7817   DT->changeImmediateDominator(LoopVectorPreHeader,
7818                                EPI.MainLoopIterationCountCheck);
7819 
7820   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7821       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7822 
7823   if (EPI.SCEVSafetyCheck)
7824     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7825         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7826   if (EPI.MemSafetyCheck)
7827     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7828         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7829 
7830   DT->changeImmediateDominator(
7831       VecEpilogueIterationCountCheck,
7832       VecEpilogueIterationCountCheck->getSinglePredecessor());
7833 
7834   DT->changeImmediateDominator(LoopScalarPreHeader,
7835                                EPI.EpilogueIterationCountCheck);
7836   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7837     // If there is an epilogue which must run, there's no edge from the
7838     // middle block to exit blocks and thus no need to update the immediate
7839     // dominator of the exit blocks.
7840     DT->changeImmediateDominator(LoopExitBlock,
7841                                  EPI.EpilogueIterationCountCheck);
7842 
7843   // Keep track of bypass blocks, as they feed start values to the induction
7844   // phis in the scalar loop preheader.
7845   if (EPI.SCEVSafetyCheck)
7846     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7847   if (EPI.MemSafetyCheck)
7848     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7849   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7850 
7851   // The vec.epilog.iter.check block may contain Phi nodes from reductions which
7852   // merge control-flow from the latch block and the middle block. Update the
7853   // incoming values here and move the Phi into the preheader.
7854   SmallVector<PHINode *, 4> PhisInBlock;
7855   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7856     PhisInBlock.push_back(&Phi);
7857 
7858   for (PHINode *Phi : PhisInBlock) {
7859     Phi->replaceIncomingBlockWith(
7860         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7861         VecEpilogueIterationCountCheck);
7862     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7863     if (EPI.SCEVSafetyCheck)
7864       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7865     if (EPI.MemSafetyCheck)
7866       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7867     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7868   }
7869 
7870   // Generate a resume induction for the vector epilogue and put it in the
7871   // vector epilogue preheader.
7872   Type *IdxTy = Legal->getWidestInductionType();
7873   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7874                                          LoopVectorPreHeader->getFirstNonPHI());
7875   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7876   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7877                            EPI.MainLoopIterationCountCheck);
7878 
7879   // Generate induction resume values. These variables save the new starting
7880   // indexes for the scalar loop. They are used to test if there are any tail
7881   // iterations left once the vector loop has completed.
7882   // Note that when the vectorized epilogue is skipped due to iteration count
7883   // check, then the resume value for the induction variable comes from
7884   // the trip count of the main vector loop, hence passing the AdditionalBypass
7885   // argument.
7886   createInductionResumeValues({VecEpilogueIterationCountCheck,
7887                                EPI.VectorTripCount} /* AdditionalBypass */);
7888 
7889   return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
7890 }
7891 
7892 BasicBlock *
7893 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7894     BasicBlock *Bypass, BasicBlock *Insert) {
7895 
7896   assert(EPI.TripCount &&
7897          "Expected trip count to have been safed in the first pass.");
7898   assert(
7899       (!isa<Instruction>(EPI.TripCount) ||
7900        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7901       "saved trip count does not dominate insertion point.");
7902   Value *TC = EPI.TripCount;
7903   IRBuilder<> Builder(Insert->getTerminator());
7904   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7905 
7906   // Generate code to check if the loop's trip count is less than VF * UF of the
7907   // vector epilogue loop.
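  // An illustrative sketch of the emitted check (assuming EpilogueVF = 4,
  // EpilogueUF = 2, an i64 trip count and no required scalar epilogue):
  //   %min.epilog.iters.check = icmp ult i64 %n.vec.remaining, 8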
7908   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7909       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7910 
7911   Value *CheckMinIters =
7912       Builder.CreateICmp(P, Count,
7913                          createStepForVF(Builder, Count->getType(),
7914                                          EPI.EpilogueVF, EPI.EpilogueUF),
7915                          "min.epilog.iters.check");
7916 
7917   ReplaceInstWithInst(
7918       Insert->getTerminator(),
7919       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7920 
7921   LoopBypassBlocks.push_back(Insert);
7922   return Insert;
7923 }
7924 
7925 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7926   LLVM_DEBUG({
7927     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7928            << "Epilogue Loop VF:" << EPI.EpilogueVF
7929            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7930   });
7931 }
7932 
7933 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7934   DEBUG_WITH_TYPE(VerboseDebug, {
7935     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7936   });
7937 }
7938 
7939 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7940     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7941   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7942   bool PredicateAtRangeStart = Predicate(Range.Start);
7943 
7944   for (ElementCount TmpVF = Range.Start * 2;
7945        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7946     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7947       Range.End = TmpVF;
7948       break;
7949     }
7950 
7951   return PredicateAtRangeStart;
7952 }
7953 
7954 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7955 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7956 /// of VF's starting at a given VF and extending it as much as possible. Each
7957 /// vectorization decision can potentially shorten this sub-range during
7958 /// buildVPlan().
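/// For example (illustrative): with \p MinVF = 2 and \p MaxVF = 16, the first
/// call to buildVPlan may clamp its sub-range to [2, 8), yielding one plan for
/// VFs {2, 4}; the next iteration then starts a new plan at VF 8.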
7959 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7960                                            ElementCount MaxVF) {
7961   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7962   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7963     VFRange SubRange = {VF, MaxVFPlusOne};
7964     VPlans.push_back(buildVPlan(SubRange));
7965     VF = SubRange.End;
7966   }
7967 }
7968 
7969 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7970                                          VPlanPtr &Plan) {
7971   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7972 
7973   // Look for cached value.
7974   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7975   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7976   if (ECEntryIt != EdgeMaskCache.end())
7977     return ECEntryIt->second;
7978 
7979   VPValue *SrcMask = createBlockInMask(Src, Plan);
7980 
7981   // The terminator has to be a branch inst!
7982   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7983   assert(BI && "Unexpected terminator found");
7984 
7985   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7986     return EdgeMaskCache[Edge] = SrcMask;
7987 
7988   // If source is an exiting block, we know the exit edge is dynamically dead
7989   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
7990   // adding uses of an otherwise potentially dead instruction.
7991   if (OrigLoop->isLoopExiting(Src))
7992     return EdgeMaskCache[Edge] = SrcMask;
7993 
7994   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
7995   assert(EdgeMask && "No Edge Mask found for condition");
7996 
7997   if (BI->getSuccessor(0) != Dst)
7998     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
7999 
8000   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8001     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8002     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8003     // The select version does not introduce new UB if SrcMask is false and
8004     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
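    // E.g. (illustrative): with SrcMask = false and EdgeMask = poison,
    // "and i1 false, poison" is poison, whereas the select form yields false.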
8005     VPValue *False = Plan->getOrAddVPValue(
8006         ConstantInt::getFalse(BI->getCondition()->getType()));
8007     EdgeMask =
8008         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8009   }
8010 
8011   return EdgeMaskCache[Edge] = EdgeMask;
8012 }
8013 
8014 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8015   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8016 
8017   // Look for cached value.
8018   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8019   if (BCEntryIt != BlockMaskCache.end())
8020     return BCEntryIt->second;
8021 
8022   // All-one mask is modelled as no-mask following the convention for masked
8023   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8024   VPValue *BlockMask = nullptr;
8025 
8026   if (OrigLoop->getHeader() == BB) {
8027     if (!CM.blockNeedsPredicationForAnyReason(BB))
8028       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8029 
8030     // Introduce the early-exit compare IV <= BTC to form header block mask.
8031     // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8032     // constructing the desired canonical IV in the header block as its first
8033     // non-phi instructions.
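    // For example (illustrative): with an i8 IV and 256 iterations, TC wraps
    // to 0 while BTC is 255, so "IV <= BTC" remains correct where "IV < TC"
    // would not.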
8034     assert(CM.foldTailByMasking() && "must fold the tail");
8035     VPBasicBlock *HeaderVPBB =
8036         Plan->getVectorLoopRegion()->getEntryBasicBlock();
8037     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8038     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8039     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8040 
8041     VPBuilder::InsertPointGuard Guard(Builder);
8042     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8043     if (CM.TTI.emitGetActiveLaneMask()) {
8044       VPValue *TC = Plan->getOrCreateTripCount();
8045       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
8046     } else {
8047       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8048       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8049     }
8050     return BlockMaskCache[BB] = BlockMask;
8051   }
8052 
8053   // This is the block mask. We OR all incoming edges.
8054   for (auto *Predecessor : predecessors(BB)) {
8055     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8056     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8057       return BlockMaskCache[BB] = EdgeMask;
8058 
8059     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8060       BlockMask = EdgeMask;
8061       continue;
8062     }
8063 
8064     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8065   }
8066 
8067   return BlockMaskCache[BB] = BlockMask;
8068 }
8069 
8070 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8071                                                 ArrayRef<VPValue *> Operands,
8072                                                 VFRange &Range,
8073                                                 VPlanPtr &Plan) {
8074   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8075          "Must be called with either a load or store");
8076 
8077   auto willWiden = [&](ElementCount VF) -> bool {
8078     LoopVectorizationCostModel::InstWidening Decision =
8079         CM.getWideningDecision(I, VF);
8080     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8081            "CM decision should be taken at this point.");
8082     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8083       return true;
8084     if (CM.isScalarAfterVectorization(I, VF) ||
8085         CM.isProfitableToScalarize(I, VF))
8086       return false;
8087     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8088   };
8089 
8090   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8091     return nullptr;
8092 
8093   VPValue *Mask = nullptr;
8094   if (Legal->isMaskRequired(I))
8095     Mask = createBlockInMask(I->getParent(), Plan);
8096 
8097   // Determine if the pointer operand of the access is either consecutive or
8098   // reverse consecutive.
8099   LoopVectorizationCostModel::InstWidening Decision =
8100       CM.getWideningDecision(I, Range.Start);
8101   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8102   bool Consecutive =
8103       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8104 
8105   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8106     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8107                                               Consecutive, Reverse);
8108 
8109   StoreInst *Store = cast<StoreInst>(I);
8110   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8111                                             Mask, Consecutive, Reverse);
8112 }
8113 
8114 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8115 /// insert a recipe to expand the step for the induction recipe.
8116 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
8117     PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
8118     const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
8119     VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
8120   // Returns true if an instruction \p I should be scalarized instead of
8121   // vectorized for the chosen vectorization factor.
8122   auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8123     return CM.isScalarAfterVectorization(I, VF) ||
8124            CM.isProfitableToScalarize(I, VF);
8125   };
8126 
8127   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8128       [&](ElementCount VF) {
8129         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8130       },
8131       Range);
8132   assert(IndDesc.getStartValue() ==
8133          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8134   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8135          "step must be loop invariant");
8136 
8137   VPValue *Step =
8138       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8139   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8140     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
8141                                              !NeedsScalarIVOnly);
8142   }
8143   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8144   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
8145                                            !NeedsScalarIVOnly);
8146 }
8147 
8148 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8149     PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8150 
8151   // Check if this is an integer or fp induction. If so, build the recipe that
8152   // produces its scalar and vector values.
8153   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8154     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
8155                                        *PSE.getSE(), *OrigLoop, Range);
8156 
8157   // Check if this is pointer induction. If so, build the recipe for it.
8158   if (auto *II = Legal->getPointerInductionDescriptor(Phi))
8159     return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II,
8160                                              *PSE.getSE());
8161   return nullptr;
8162 }
8163 
8164 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8165     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8166   // Optimize the special case where the source is a constant integer
8167   // induction variable. Notice that we can only optimize the 'trunc' case
8168   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8169   // (c) other casts depend on pointer size.
8170 
8171   // Determine whether \p K is a truncation based on an induction variable that
8172   // can be optimized.
8173   auto isOptimizableIVTruncate =
8174       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8175     return [=](ElementCount VF) -> bool {
8176       return CM.isOptimizableIVTruncate(K, VF);
8177     };
8178   };
8179 
8180   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8181           isOptimizableIVTruncate(I), Range)) {
8182 
8183     auto *Phi = cast<PHINode>(I->getOperand(0));
8184     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8185     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8186     return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
8187                                        *PSE.getSE(), *OrigLoop, Range);
8188   }
8189   return nullptr;
8190 }
8191 
8192 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8193                                                 ArrayRef<VPValue *> Operands,
8194                                                 VPlanPtr &Plan) {
8195   // If all incoming values are equal, the incoming VPValue can be used directly
8196   // instead of creating a new VPBlendRecipe.
8197   VPValue *FirstIncoming = Operands[0];
8198   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8199         return FirstIncoming == Inc;
8200       })) {
8201     return Operands[0];
8202   }
8203 
8204   unsigned NumIncoming = Phi->getNumIncomingValues();
8205   // For in-loop reductions, we do not need to create an additional select.
8206   VPValue *InLoopVal = nullptr;
8207   for (unsigned In = 0; In < NumIncoming; In++) {
8208     PHINode *PhiOp =
8209         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8210     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8211       assert(!InLoopVal && "Found more than one in-loop reduction!");
8212       InLoopVal = Operands[In];
8213     }
8214   }
8215 
8216   assert((!InLoopVal || NumIncoming == 2) &&
8217          "Found an in-loop reduction for PHI with unexpected number of "
8218          "incoming values");
8219   if (InLoopVal)
8220     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8221 
8222   // We know that all PHIs in non-header blocks are converted into selects, so
8223   // we don't have to worry about the insertion order and we can just use the
8224   // builder. At this point we generate the predication tree. There may be
8225   // duplications since this is a simple recursive scan, but future
8226   // optimizations will clean it up.
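  // Illustrative sketch (block and value names invented): for
  //   %p = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
  // the blend recipe receives the operand list (%a, mask(bb1->bb), %b,
  // mask(bb2->bb)) and is later lowered to selects on those masks.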
8227   SmallVector<VPValue *, 2> OperandsWithMask;
8228 
8229   for (unsigned In = 0; In < NumIncoming; In++) {
8230     VPValue *EdgeMask =
8231       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8232     assert((EdgeMask || NumIncoming == 1) &&
8233            "Multiple predecessors with one having a full mask");
8234     OperandsWithMask.push_back(Operands[In]);
8235     if (EdgeMask)
8236       OperandsWithMask.push_back(EdgeMask);
8237   }
8238   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8239 }
8240 
8241 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8242                                                    ArrayRef<VPValue *> Operands,
8243                                                    VFRange &Range) const {
8244 
8245   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8246       [this, CI](ElementCount VF) {
8247         return CM.isScalarWithPredication(CI, VF);
8248       },
8249       Range);
8250 
8251   if (IsPredicated)
8252     return nullptr;
8253 
8254   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8255   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8256              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8257              ID == Intrinsic::pseudoprobe ||
8258              ID == Intrinsic::experimental_noalias_scope_decl))
8259     return nullptr;
8260 
8261   auto willWiden = [&](ElementCount VF) -> bool {
8262     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8263     // The following case may be scalarized depending on the VF.
8264     // The flag shows whether we use an intrinsic or a usual call for the
8265     // vectorized version of the instruction.
8266     // Is it beneficial to perform the intrinsic call compared to the lib call?
8267     bool NeedToScalarize = false;
8268     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8269     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8270     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8271     return UseVectorIntrinsic || !NeedToScalarize;
8272   };
8273 
8274   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8275     return nullptr;
8276 
8277   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8278   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8279 }
8280 
8281 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8282   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8283          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8284   // Instruction should be widened, unless it is scalar after vectorization,
8285   // scalarization is profitable or it is predicated.
8286   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8287     return CM.isScalarAfterVectorization(I, VF) ||
8288            CM.isProfitableToScalarize(I, VF) ||
8289            CM.isScalarWithPredication(I, VF);
8290   };
8291   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8292                                                              Range);
8293 }
8294 
8295 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8296                                            ArrayRef<VPValue *> Operands) const {
8297   auto IsVectorizableOpcode = [](unsigned Opcode) {
8298     switch (Opcode) {
8299     case Instruction::Add:
8300     case Instruction::And:
8301     case Instruction::AShr:
8302     case Instruction::BitCast:
8303     case Instruction::FAdd:
8304     case Instruction::FCmp:
8305     case Instruction::FDiv:
8306     case Instruction::FMul:
8307     case Instruction::FNeg:
8308     case Instruction::FPExt:
8309     case Instruction::FPToSI:
8310     case Instruction::FPToUI:
8311     case Instruction::FPTrunc:
8312     case Instruction::FRem:
8313     case Instruction::FSub:
8314     case Instruction::ICmp:
8315     case Instruction::IntToPtr:
8316     case Instruction::LShr:
8317     case Instruction::Mul:
8318     case Instruction::Or:
8319     case Instruction::PtrToInt:
8320     case Instruction::SDiv:
8321     case Instruction::Select:
8322     case Instruction::SExt:
8323     case Instruction::Shl:
8324     case Instruction::SIToFP:
8325     case Instruction::SRem:
8326     case Instruction::Sub:
8327     case Instruction::Trunc:
8328     case Instruction::UDiv:
8329     case Instruction::UIToFP:
8330     case Instruction::URem:
8331     case Instruction::Xor:
8332     case Instruction::ZExt:
8333     case Instruction::Freeze:
8334       return true;
8335     }
8336     return false;
8337   };
8338 
8339   if (!IsVectorizableOpcode(I->getOpcode()))
8340     return nullptr;
8341 
8342   // Success: widen this instruction.
8343   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8344 }
8345 
8346 void VPRecipeBuilder::fixHeaderPhis() {
8347   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8348   for (VPHeaderPHIRecipe *R : PhisToFix) {
8349     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8350     VPRecipeBase *IncR =
8351         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8352     R->addOperand(IncR->getVPSingleValue());
8353   }
8354 }
8355 
8356 VPBasicBlock *VPRecipeBuilder::handleReplication(
8357     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8358     VPlanPtr &Plan) {
8359   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8360       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8361       Range);
8362 
8363   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8364       [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
8365       Range);
8366 
8367   // Even if the instruction is not marked as uniform, there are certain
8368   // intrinsic calls that can be effectively treated as such, so we check for
8369   // them here. Conservatively, we only do this for scalable vectors, since
8370   // for fixed-width VFs we can always fall back on full scalarization.
8371   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8372     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8373     case Intrinsic::assume:
8374     case Intrinsic::lifetime_start:
8375     case Intrinsic::lifetime_end:
8376       // For scalable vectors if one of the operands is variant then we still
8377       // want to mark as uniform, which will generate one instruction for just
8378       // the first lane of the vector. We can't scalarize the call in the same
8379       // way as for fixed-width vectors because we don't know how many lanes
8380       // there are.
8381       //
8382       // The reasons for doing it this way for scalable vectors are:
8383       //   1. For the assume intrinsic generating the instruction for the first
8384       //      lane is still better than not generating any at all. For
8385       //      example, the input may be a splat across all lanes.
8386       //   2. For the lifetime start/end intrinsics the pointer operand only
8387       //      does anything useful when the input comes from a stack object,
8388       //      which suggests it should always be uniform. For non-stack objects
8389       //      the effect is to poison the object, which still allows us to
8390       //      remove the call.
8391       IsUniform = true;
8392       break;
8393     default:
8394       break;
8395     }
8396   }
8397 
8398   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8399                                        IsUniform, IsPredicated);
8400   setRecipe(I, Recipe);
8401   Plan->addVPValue(I, Recipe);
8402 
8403   // Find if I uses a predicated instruction. If so, it will use its scalar
8404   // value. Avoid hoisting the insert-element which packs the scalar value into
8405   // a vector value, as that happens iff all users use the vector value.
8406   for (VPValue *Op : Recipe->operands()) {
8407     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8408     if (!PredR)
8409       continue;
8410     auto *RepR =
8411         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8412     assert(RepR->isPredicated() &&
8413            "expected Replicate recipe to be predicated");
8414     RepR->setAlsoPack(false);
8415   }
8416 
  // Finalize the recipe for Instr. First handle the simpler case where it is
  // not predicated.
8418   if (!IsPredicated) {
8419     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8420     VPBB->appendRecipe(Recipe);
8421     return VPBB;
8422   }
8423   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8424 
8425   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8426   assert(SingleSucc && "VPBB must have a single successor when handling "
8427                        "predicated replication.");
8428   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8429   // Record predicated instructions for above packing optimizations.
8430   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8431   VPBlockUtils::insertBlockAfter(Region, VPBB);
8432   auto *RegSucc = new VPBasicBlock();
8433   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8434   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8435   return RegSucc;
8436 }
8437 
8438 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8439                                                       VPRecipeBase *PredRecipe,
8440                                                       VPlanPtr &Plan) {
8441   // Instructions marked for predication are replicated and placed under an
8442   // if-then construct to prevent side-effects.
8443 
8444   // Generate recipes to compute the block mask for this region.
8445   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8446 
8447   // Build the triangular if-then region.
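  // The region being built has the following triangular shape; the predicated
  // recipe in the ".if" block only executes for active lanes of the mask:
  //
  //   pred.<opcode>.entry (branch-on-mask)
  //        |        \
  //        |    pred.<opcode>.if (predicated recipe)
  //        |        /
  //   pred.<opcode>.continue (phi merging the predicated value, if any)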
8448   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8449   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8450   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8451   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8452   auto *PHIRecipe = Instr->getType()->isVoidTy()
8453                         ? nullptr
8454                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8455   if (PHIRecipe) {
8456     Plan->removeVPValueFor(Instr);
8457     Plan->addVPValue(Instr, PHIRecipe);
8458   }
8459   auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8460   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8461   VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
8462 
8463   // Note: first set Entry as region entry and then connect successors starting
8464   // from it in order, to propagate the "parent" of each VPBasicBlock.
8465   VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
8466   VPBlockUtils::connectBlocks(Pred, Exiting);
8467 
8468   return Region;
8469 }
8470 
8471 VPRecipeOrVPValueTy
8472 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8473                                         ArrayRef<VPValue *> Operands,
8474                                         VFRange &Range, VPlanPtr &Plan) {
8475   // First, check for specific widening recipes that deal with inductions, Phi
8476   // nodes, calls and memory operations.
8477   VPRecipeBase *Recipe;
8478   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8479     if (Phi->getParent() != OrigLoop->getHeader())
8480       return tryToBlend(Phi, Operands, Plan);
8481     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8482       return toVPRecipeResult(Recipe);
8483 
8484     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8485     assert((Legal->isReductionVariable(Phi) ||
8486             Legal->isFirstOrderRecurrence(Phi)) &&
8487            "can only widen reductions and first-order recurrences here");
8488     VPValue *StartV = Operands[0];
8489     if (Legal->isReductionVariable(Phi)) {
8490       const RecurrenceDescriptor &RdxDesc =
8491           Legal->getReductionVars().find(Phi)->second;
8492       assert(RdxDesc.getRecurrenceStartValue() ==
8493              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8494       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8495                                            CM.isInLoopReduction(Phi),
8496                                            CM.useOrderedReductions(RdxDesc));
8497     } else {
8498       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8499     }
8500 
8501     // Record the incoming value from the backedge, so we can add the incoming
8502     // value from the backedge after all recipes have been created.
8503     recordRecipeOf(cast<Instruction>(
8504         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8505     PhisToFix.push_back(PhiRecipe);
8506     return toVPRecipeResult(PhiRecipe);
8507   }
8508 
8509   if (isa<TruncInst>(Instr) &&
8510       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8511                                                Range, *Plan)))
8512     return toVPRecipeResult(Recipe);
8513 
8514   // All widen recipes below deal only with VF > 1.
8515   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8516           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8517     return nullptr;
8518 
8519   if (auto *CI = dyn_cast<CallInst>(Instr))
8520     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8521 
8522   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8523     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8524 
8525   if (!shouldWiden(Instr, Range))
8526     return nullptr;
8527 
8528   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8529     return toVPRecipeResult(new VPWidenGEPRecipe(
8530         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8531 
8532   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8533     bool InvariantCond =
8534         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8535     return toVPRecipeResult(new VPWidenSelectRecipe(
8536         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8537   }
8538 
8539   return toVPRecipeResult(tryToWiden(Instr, Operands));
8540 }
8541 
8542 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8543                                                         ElementCount MaxVF) {
8544   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8545 
8546   // Collect instructions from the original loop that will become trivially dead
8547   // in the vectorized loop. We don't need to vectorize these instructions. For
8548   // example, original induction update instructions can become dead because we
8549   // separately emit induction "steps" when generating code for the new loop.
8550   // Similarly, we create a new latch condition when setting up the structure
8551   // of the new loop, so the old one can become dead.
8552   SmallPtrSet<Instruction *, 4> DeadInstructions;
8553   collectTriviallyDeadInstructions(DeadInstructions);
8554 
8555   // Add assume instructions we need to drop to DeadInstructions, to prevent
8556   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
  // control flow is preserved, we should keep them.
8559   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8560   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8561 
8562   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8563   // Dead instructions do not need sinking. Remove them from SinkAfter.
8564   for (Instruction *I : DeadInstructions)
8565     SinkAfter.erase(I);
8566 
8567   // Cannot sink instructions after dead instructions (there won't be any
8568   // recipes for them). Instead, find the first non-dead previous instruction.
8569   for (auto &P : Legal->getSinkAfter()) {
8570     Instruction *SinkTarget = P.second;
8571     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8572     (void)FirstInst;
8573     while (DeadInstructions.contains(SinkTarget)) {
8574       assert(
8575           SinkTarget != FirstInst &&
8576           "Must find a live instruction (at least the one feeding the "
8577           "first-order recurrence PHI) before reaching beginning of the block");
8578       SinkTarget = SinkTarget->getPrevNode();
8579       assert(SinkTarget != P.first &&
8580              "sink source equals target, no sinking required");
8581     }
8582     P.second = SinkTarget;
8583   }
8584 
8585   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8586   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8587     VFRange SubRange = {VF, MaxVFPlusOne};
8588     VPlans.push_back(
8589         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8590     VF = SubRange.End;
8591   }
8592 }
8593 
8594 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
8595 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
8596 // BranchOnCount VPInstruction to the latch.
8597 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8598                                   bool HasNUW) {
8599   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8600   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8601 
8602   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8603   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8604   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8605   Header->insert(CanonicalIVPHI, Header->begin());
8606 
8607   auto *CanonicalIVIncrement =
8608       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8609                                : VPInstruction::CanonicalIVIncrement,
8610                         {CanonicalIVPHI}, DL);
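  // Add the increment as the phi's second operand, i.e. the value incoming
  // from the loop backedge.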
8611   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8612 
8613   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8614   EB->appendRecipe(CanonicalIVIncrement);
8615 
8616   auto *BranchOnCount =
8617       new VPInstruction(VPInstruction::BranchOnCount,
8618                         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8619   EB->appendRecipe(BranchOnCount);
8620 }
8621 
8622 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8623 // original exit block.
8624 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
8625                                 VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
8626                                 VPlan &Plan) {
8627   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8628   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8629   // Only handle single-exit loops with unique exit blocks for now.
8630   if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8631     return;
8632 
8633   // Introduce VPUsers modeling the exit values.
8634   for (PHINode &ExitPhi : ExitBB->phis()) {
8635     Value *IncomingValue =
8636         ExitPhi.getIncomingValueForBlock(ExitingBB);
8637     VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
8638     Plan.addLiveOut(&ExitPhi, V);
8639   }
8640 }
8641 
8642 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8643     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8644     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8645 
8646   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8647 
8648   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8649 
8650   // ---------------------------------------------------------------------------
8651   // Pre-construction: record ingredients whose recipes we'll need to further
8652   // process after constructing the initial VPlan.
8653   // ---------------------------------------------------------------------------
8654 
8655   // Mark instructions we'll need to sink later and their targets as
8656   // ingredients whose recipe we'll need to record.
8657   for (auto &Entry : SinkAfter) {
8658     RecipeBuilder.recordRecipeOf(Entry.first);
8659     RecipeBuilder.recordRecipeOf(Entry.second);
8660   }
8661   for (auto &Reduction : CM.getInLoopReductionChains()) {
8662     PHINode *Phi = Reduction.first;
8663     RecurKind Kind =
8664         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8665     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8666 
8667     RecipeBuilder.recordRecipeOf(Phi);
8668     for (auto &R : ReductionOperations) {
8669       RecipeBuilder.recordRecipeOf(R);
8670       // For min/max reductions, where we have a pair of icmp/select, we also
8671       // need to record the ICmp recipe, so it can be removed later.
8672       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8673              "Only min/max recurrences allowed for inloop reductions");
8674       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8675         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8676     }
8677   }
8678 
8679   // For each interleave group which is relevant for this (possibly trimmed)
8680   // Range, add it to the set of groups to be later applied to the VPlan and add
8681   // placeholders for its members' Recipes which we'll be replacing with a
8682   // single VPInterleaveRecipe.
8683   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8684     auto applyIG = [IG, this](ElementCount VF) -> bool {
8685       return (VF.isVector() && // Query is illegal for VF == 1
8686               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8687                   LoopVectorizationCostModel::CM_Interleave);
8688     };
8689     if (!getDecisionAndClampRange(applyIG, Range))
8690       continue;
8691     InterleaveGroups.insert(IG);
8692     for (unsigned i = 0; i < IG->getFactor(); i++)
8693       if (Instruction *Member = IG->getMember(i))
8694         RecipeBuilder.recordRecipeOf(Member);
8695   };
8696 
8697   // ---------------------------------------------------------------------------
8698   // Build initial VPlan: Scan the body of the loop in a topological order to
8699   // visit each basic block after having visited its predecessor basic blocks.
8700   // ---------------------------------------------------------------------------
8701 
8702   // Create initial VPlan skeleton, starting with a block for the pre-header,
8703   // followed by a region for the vector loop, followed by the middle block. The
8704   // skeleton vector loop region contains a header and latch block.
8705   VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
8706   auto Plan = std::make_unique<VPlan>(Preheader);
8707 
8708   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8709   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8710   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8711   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8712   VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
8713   VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
8714   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
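  // At this point the skeleton is:
  //   vector.ph -> { vector.body -> vector.latch } -> middle.block
  // where the braces denote the "vector loop" region.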
8715 
8716   Instruction *DLInst =
8717       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8718   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8719                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8720                         !CM.foldTailByMasking());
8721 
8722   // Scan the body of the loop in a topological order to visit each basic block
8723   // after having visited its predecessor basic blocks.
8724   LoopBlocksDFS DFS(OrigLoop);
8725   DFS.perform(LI);
8726 
8727   VPBasicBlock *VPBB = HeaderVPBB;
8728   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8729   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8730     // Relevant instructions from basic block BB will be grouped into VPRecipe
8731     // ingredients and fill a new VPBasicBlock.
8732     unsigned VPBBsForBB = 0;
8733     if (VPBB != HeaderVPBB)
8734       VPBB->setName(BB->getName());
8735     Builder.setInsertPoint(VPBB);
8736 
8737     // Introduce each ingredient into VPlan.
8738     // TODO: Model and preserve debug intrinsics in VPlan.
8739     for (Instruction &I : BB->instructionsWithoutDebug()) {
8740       Instruction *Instr = &I;
8741 
8742       // First filter out irrelevant instructions, to ensure no recipes are
8743       // built for them.
8744       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8745         continue;
8746 
8747       SmallVector<VPValue *, 4> Operands;
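      // Header phis only get their start value (incoming from the preheader)
      // as operand here; the value incoming from the latch is added later by
      // fixHeaderPhis, once its recipe has been created.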
8748       auto *Phi = dyn_cast<PHINode>(Instr);
8749       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8750         Operands.push_back(Plan->getOrAddVPValue(
8751             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8752       } else {
8753         auto OpRange = Plan->mapToVPValues(Instr->operands());
8754         Operands = {OpRange.begin(), OpRange.end()};
8755       }
8756 
      // Invariant stores inside the loop will be deleted, and a single store
      // with the final reduction value will be added to the exit block.
8759       StoreInst *SI;
8760       if ((SI = dyn_cast<StoreInst>(&I)) &&
8761           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8762         continue;
8763 
8764       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8765               Instr, Operands, Range, Plan)) {
8766         // If Instr can be simplified to an existing VPValue, use it.
8767         if (RecipeOrValue.is<VPValue *>()) {
8768           auto *VPV = RecipeOrValue.get<VPValue *>();
8769           Plan->addVPValue(Instr, VPV);
8770           // If the re-used value is a recipe, register the recipe for the
8771           // instruction, in case the recipe for Instr needs to be recorded.
8772           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
8773             RecipeBuilder.setRecipe(Instr, R);
8774           continue;
8775         }
8776         // Otherwise, add the new recipe.
8777         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8778         for (auto *Def : Recipe->definedValues()) {
8779           auto *UV = Def->getUnderlyingValue();
8780           Plan->addVPValue(UV, Def);
8781         }
8782 
8783         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8784             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8785           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8786           // of the header block. That can happen for truncates of induction
8787           // variables. Those recipes are moved to the phi section of the header
8788           // block after applying SinkAfter, which relies on the original
8789           // position of the trunc.
8790           assert(isa<TruncInst>(Instr));
8791           InductionsToMove.push_back(
8792               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8793         }
8794         RecipeBuilder.setRecipe(Instr, Recipe);
8795         VPBB->appendRecipe(Recipe);
8796         continue;
8797       }
8798 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
8801       VPBasicBlock *NextVPBB =
8802           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8803       if (NextVPBB != VPBB) {
8804         VPBB = NextVPBB;
8805         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8806                                     : "");
8807       }
8808     }
8809 
8810     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8811     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8812   }
8813 
8814   HeaderVPBB->setName("vector.body");
8815 
8816   // Fold the last, empty block into its predecessor.
8817   VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
8818   assert(VPBB && "expected to fold last (empty) block");
8819   // After here, VPBB should not be used.
8820   VPBB = nullptr;
8821 
8822   addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
8823 
8824   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8825          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8826          "entry block must be set to a VPRegionBlock having a non-empty entry "
8827          "VPBasicBlock");
8828   RecipeBuilder.fixHeaderPhis();
8829 
8830   // ---------------------------------------------------------------------------
8831   // Transform initial VPlan: Apply previously taken decisions, in order, to
8832   // bring the VPlan to its final state.
8833   // ---------------------------------------------------------------------------
8834 
8835   // Apply Sink-After legal constraints.
8836   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
8837     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
8838     if (Region && Region->isReplicator()) {
8839       assert(Region->getNumSuccessors() == 1 &&
8840              Region->getNumPredecessors() == 1 && "Expected SESE region!");
8841       assert(R->getParent()->size() == 1 &&
8842              "A recipe in an original replicator region must be the only "
8843              "recipe in its block");
8844       return Region;
8845     }
8846     return nullptr;
8847   };
8848   for (auto &Entry : SinkAfter) {
8849     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8850     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8851 
8852     auto *TargetRegion = GetReplicateRegion(Target);
8853     auto *SinkRegion = GetReplicateRegion(Sink);
8854     if (!SinkRegion) {
8855       // If the sink source is not a replicate region, sink the recipe directly.
8856       if (TargetRegion) {
8857         // The target is in a replication region, make sure to move Sink to
8858         // the block after it, not into the replication region itself.
8859         VPBasicBlock *NextBlock =
8860             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
8861         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
8862       } else
8863         Sink->moveAfter(Target);
8864       continue;
8865     }
8866 
8867     // The sink source is in a replicate region. Unhook the region from the CFG.
8868     auto *SinkPred = SinkRegion->getSinglePredecessor();
8869     auto *SinkSucc = SinkRegion->getSingleSuccessor();
8870     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
8871     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
8872     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
8873 
8874     if (TargetRegion) {
8875       // The target recipe is also in a replicate region, move the sink region
8876       // after the target region.
8877       auto *TargetSucc = TargetRegion->getSingleSuccessor();
8878       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
8879       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
8880       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
8881     } else {
      // The sink source is in a replicate region but the target is not. Move
      // the whole replicate region, which should only contain a single recipe
      // in its main block, directly after the target recipe.
8885       auto *SplitBlock =
8886           Target->getParent()->splitAt(std::next(Target->getIterator()));
8887 
8888       auto *SplitPred = SplitBlock->getSinglePredecessor();
8889 
8890       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
8891       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
8892       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
8893     }
8894   }
8895 
8896   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
8897   VPlanTransforms::removeRedundantInductionCasts(*Plan);
8898 
8899   // Now that sink-after is done, move induction recipes for optimized truncates
8900   // to the phi section of the header block.
8901   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
8902     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8903 
8904   // Adjust the recipes for any inloop reductions.
8905   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
8906                              RecipeBuilder, Range.Start);
8907 
8908   // Introduce a recipe to combine the incoming and previous values of a
8909   // first-order recurrence.
8910   for (VPRecipeBase &R :
8911        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8912     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
8913     if (!RecurPhi)
8914       continue;
8915 
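    // Insert the splice right after the recipe defining the previous value. If
    // that recipe sits inside a replicate region, insert at the beginning of
    // the block following the region instead; similarly, if it is a phi,
    // insert after the phi section of its block.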
8916     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
8917     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
8918     auto *Region = GetReplicateRegion(PrevRecipe);
8919     if (Region)
8920       InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor());
8921     if (!InsertBlock) {
8922       InsertBlock = new VPBasicBlock(Region->getName() + ".succ");
8923       VPBlockUtils::insertBlockAfter(InsertBlock, Region);
8924     }
8925     if (Region || PrevRecipe->isPhi())
8926       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
8927     else
8928       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
8929 
8930     auto *RecurSplice = cast<VPInstruction>(
8931         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
8932                              {RecurPhi, RecurPhi->getBackedgeValue()}));
8933 
8934     RecurPhi->replaceAllUsesWith(RecurSplice);
8935     // Set the first operand of RecurSplice to RecurPhi again, after replacing
8936     // all users.
8937     RecurSplice->setOperand(0, RecurPhi);
8938   }
8939 
8940   // Interleave memory: for each Interleave Group we marked earlier as relevant
8941   // for this VPlan, replace the Recipes widening its memory instructions with a
8942   // single VPInterleaveRecipe at its insertion point.
8943   for (auto IG : InterleaveGroups) {
8944     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8945         RecipeBuilder.getRecipe(IG->getInsertPos()));
8946     SmallVector<VPValue *, 4> StoredValues;
8947     for (unsigned i = 0; i < IG->getFactor(); ++i)
8948       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8949         auto *StoreR =
8950             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
8951         StoredValues.push_back(StoreR->getStoredValue());
8952       }
8953 
8954     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8955                                         Recipe->getMask());
8956     VPIG->insertBefore(Recipe);
8957     unsigned J = 0;
8958     for (unsigned i = 0; i < IG->getFactor(); ++i)
8959       if (Instruction *Member = IG->getMember(i)) {
8960         if (!Member->getType()->isVoidTy()) {
8961           VPValue *OriginalV = Plan->getVPValue(Member);
8962           Plan->removeVPValueFor(Member);
8963           Plan->addVPValue(Member, VPIG->getVPValue(J));
8964           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
8965           J++;
8966         }
8967         RecipeBuilder.getRecipe(Member)->eraseFromParent();
8968       }
8969   }
8970 
8971   std::string PlanName;
8972   raw_string_ostream RSO(PlanName);
8973   ElementCount VF = Range.Start;
8974   Plan->addVF(VF);
8975   RSO << "Initial VPlan for VF={" << VF;
8976   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
8977     Plan->addVF(VF);
8978     RSO << "," << VF;
8979   }
8980   RSO << "},UF>=1";
8981   RSO.flush();
8982   Plan->setName(PlanName);
8983 
8984   // From this point onwards, VPlan-to-VPlan transformations may change the plan
8985   // in ways that accessing values using original IR values is incorrect.
8986   Plan->disableValue2VPValue();
8987 
8988   VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
8989   VPlanTransforms::sinkScalarOperands(*Plan);
8990   VPlanTransforms::mergeReplicateRegions(*Plan);
8991   VPlanTransforms::removeDeadRecipes(*Plan);
8992   VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
8993 
8994   // Fold Exit block into its predecessor if possible.
8995   // TODO: Fold block earlier once all VPlan transforms properly maintain a
8996   // VPBasicBlock as exit.
8997   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting());
8998 
8999   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9000   return Plan;
9001 }
9002 
9003 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
9006   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9007   // the vectorization pipeline.
9008   assert(!OrigLoop->isInnermost());
9009   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9010 
9011   // Create new empty VPlan
9012   auto Plan = std::make_unique<VPlan>();
9013 
9014   // Build hierarchical CFG
9015   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9016   HCFGBuilder.buildHierarchicalCFG();
9017 
9018   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9019        VF *= 2)
9020     Plan->addVF(VF);
9021 
9022   SmallPtrSet<Instruction *, 1> DeadInstructions;
9023   VPlanTransforms::VPInstructionsToVPRecipes(
9024       OrigLoop, Plan,
9025       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9026       DeadInstructions, *PSE.getSE());
9027 
9028   // Remove the existing terminator of the exiting block of the top-most region.
9029   // A BranchOnCount will be added instead when adding the canonical IV recipes.
9030   auto *Term =
9031       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9032   Term->eraseFromParent();
9033 
9034   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9035                         true);
9036   return Plan;
9037 }
9038 
// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
9044 void LoopVectorizationPlanner::adjustRecipesForReductions(
9045     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9046     ElementCount MinVF) {
9047   for (auto &Reduction : CM.getInLoopReductionChains()) {
9048     PHINode *Phi = Reduction.first;
9049     const RecurrenceDescriptor &RdxDesc =
9050         Legal->getReductionVars().find(Phi)->second;
9051     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9052 
9053     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9054       continue;
9055 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
9060     Instruction *Chain = Phi;
9061     for (Instruction *R : ReductionOperations) {
9062       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9063       RecurKind Kind = RdxDesc.getRecurrenceKind();
9064 
9065       VPValue *ChainOp = Plan->getVPValue(Chain);
9066       unsigned FirstOpId;
9067       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9068              "Only min/max recurrences allowed for inloop reductions");
9069       // Recognize a call to the llvm.fmuladd intrinsic.
9070       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9071       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9072              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9073       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9074         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9075                "Expected to replace a VPWidenSelectSC");
9076         FirstOpId = 1;
9077       } else {
9078         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9079                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9080                "Expected to replace a VPWidenSC");
9081         FirstOpId = 0;
9082       }
9083       unsigned VecOpId =
9084           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9085       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9086 
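      // If the reduction operation is in a block that needs predication, guard
      // the reduction with the block-in mask so that masked-off lanes do not
      // contribute to the result.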
9087       auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
9088                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9089                          : nullptr;
9090 
9091       if (IsFMulAdd) {
9092         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9093         // need to create an fmul recipe to use as the vector operand for the
9094         // fadd reduction.
9095         VPInstruction *FMulRecipe = new VPInstruction(
9096             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9097         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9098         WidenRecipe->getParent()->insert(FMulRecipe,
9099                                          WidenRecipe->getIterator());
9100         VecOp = FMulRecipe;
9101       }
9102       VPReductionRecipe *RedRecipe =
9103           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9104       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9105       Plan->removeVPValueFor(R);
9106       Plan->addVPValue(R, RedRecipe);
      WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
      WidenRecipe->eraseFromParent();
9110 
9111       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9112         VPRecipeBase *CompareRecipe =
9113             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9114         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9115                "Expected to replace a VPWidenSC");
9116         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9117                "Expected no remaining users");
9118         CompareRecipe->eraseFromParent();
9119       }
9120       Chain = R;
9121     }
9122   }
9123 
  // If the tail is folded by masking, introduce selects between the phi
  // and the live-out instruction of each reduction, at the beginning of the
  // dedicated latch block.
9127   if (CM.foldTailByMasking()) {
9128     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9129     for (VPRecipeBase &R :
9130          Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9131       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9132       if (!PhiR || PhiR->isInLoop())
9133         continue;
9134       VPValue *Cond =
9135           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9136       VPValue *Red = PhiR->getBackedgeValue();
9137       assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
9138              "reduction recipe must be defined before latch");
9139       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9140     }
9141   }
9142 }
9143 
9144 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9145 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9146                                VPSlotTracker &SlotTracker) const {
9147   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9148   IG->getInsertPos()->printAsOperand(O, false);
9149   O << ", ";
9150   getAddr()->printAsOperand(O, SlotTracker);
9151   VPValue *Mask = getMask();
9152   if (Mask) {
9153     O << ", ";
9154     Mask->printAsOperand(O, SlotTracker);
9155   }
9156 
9157   unsigned OpIdx = 0;
9158   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9159     if (!IG->getMember(i))
9160       continue;
9161     if (getNumStoreOperands() > 0) {
9162       O << "\n" << Indent << "  store ";
9163       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9164       O << " to index " << i;
9165     } else {
9166       O << "\n" << Indent << "  ";
9167       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9168       O << " = load from index " << i;
9169     }
9170     ++OpIdx;
9171   }
9172 }
9173 #endif
9174 
9175 void VPWidenCallRecipe::execute(VPTransformState &State) {
9176   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9177                                   *this, State);
9178 }
9179 
9180 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9181   auto &I = *cast<SelectInst>(getUnderlyingInstr());
9182   State.ILV->setDebugLocFromInst(&I);
9183 
  // The condition can be loop invariant but still defined inside the
  // loop. This means that we can't just use the original 'cond' value.
  // We have to take the 'vectorized' value and pick the first lane.
  // Instcombine will make this a no-op.
9188   auto *InvarCond =
9189       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9190 
9191   for (unsigned Part = 0; Part < State.UF; ++Part) {
9192     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9193     Value *Op0 = State.get(getOperand(1), Part);
9194     Value *Op1 = State.get(getOperand(2), Part);
9195     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9196     State.set(this, Sel, Part);
9197     State.addMetadata(Sel, &I);
9198   }
9199 }
9200 
9201 void VPWidenRecipe::execute(VPTransformState &State) {
9202   auto &I = *cast<Instruction>(getUnderlyingValue());
9203   auto &Builder = State.Builder;
9204   switch (I.getOpcode()) {
9205   case Instruction::Call:
9206   case Instruction::Br:
9207   case Instruction::PHI:
9208   case Instruction::GetElementPtr:
9209   case Instruction::Select:
9210     llvm_unreachable("This instruction is handled by a different recipe.");
9211   case Instruction::UDiv:
9212   case Instruction::SDiv:
9213   case Instruction::SRem:
9214   case Instruction::URem:
9215   case Instruction::Add:
9216   case Instruction::FAdd:
9217   case Instruction::Sub:
9218   case Instruction::FSub:
9219   case Instruction::FNeg:
9220   case Instruction::Mul:
9221   case Instruction::FMul:
9222   case Instruction::FDiv:
9223   case Instruction::FRem:
9224   case Instruction::Shl:
9225   case Instruction::LShr:
9226   case Instruction::AShr:
9227   case Instruction::And:
9228   case Instruction::Or:
9229   case Instruction::Xor: {
9230     // Just widen unops and binops.
9231     State.ILV->setDebugLocFromInst(&I);
9232 
9233     for (unsigned Part = 0; Part < State.UF; ++Part) {
9234       SmallVector<Value *, 2> Ops;
9235       for (VPValue *VPOp : operands())
9236         Ops.push_back(State.get(VPOp, Part));
9237 
9238       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
9239 
9240       if (auto *VecOp = dyn_cast<Instruction>(V)) {
9241         VecOp->copyIRFlags(&I);
9242 
9243         // If the instruction is vectorized and was in a basic block that needed
9244         // predication, we can't propagate poison-generating flags (nuw/nsw,
9245         // exact, etc.). The control flow has been linearized and the
9246         // instruction is no longer guarded by the predicate, which could make
9247         // the flag properties to no longer hold.
9248         if (State.MayGeneratePoisonRecipes.contains(this))
9249           VecOp->dropPoisonGeneratingFlags();
9250       }
9251 
9252       // Use this vector value for all users of the original instruction.
9253       State.set(this, V, Part);
9254       State.addMetadata(V, &I);
9255     }
9256 
9257     break;
9258   }
9259   case Instruction::Freeze: {
9260     State.ILV->setDebugLocFromInst(&I);
9261 
9262     for (unsigned Part = 0; Part < State.UF; ++Part) {
9263       Value *Op = State.get(getOperand(0), Part);
9264 
9265       Value *Freeze = Builder.CreateFreeze(Op);
9266       State.set(this, Freeze, Part);
9267     }
9268     break;
9269   }
9270   case Instruction::ICmp:
9271   case Instruction::FCmp: {
9272     // Widen compares. Generate vector compares.
9273     bool FCmp = (I.getOpcode() == Instruction::FCmp);
9274     auto *Cmp = cast<CmpInst>(&I);
9275     State.ILV->setDebugLocFromInst(Cmp);
9276     for (unsigned Part = 0; Part < State.UF; ++Part) {
9277       Value *A = State.get(getOperand(0), Part);
9278       Value *B = State.get(getOperand(1), Part);
9279       Value *C = nullptr;
9280       if (FCmp) {
9281         // Propagate fast math flags.
9282         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9283         Builder.setFastMathFlags(Cmp->getFastMathFlags());
9284         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
9285       } else {
9286         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
9287       }
9288       State.set(this, C, Part);
9289       State.addMetadata(C, &I);
9290     }
9291 
9292     break;
9293   }
9294 
9295   case Instruction::ZExt:
9296   case Instruction::SExt:
9297   case Instruction::FPToUI:
9298   case Instruction::FPToSI:
9299   case Instruction::FPExt:
9300   case Instruction::PtrToInt:
9301   case Instruction::IntToPtr:
9302   case Instruction::SIToFP:
9303   case Instruction::UIToFP:
9304   case Instruction::Trunc:
9305   case Instruction::FPTrunc:
9306   case Instruction::BitCast: {
9307     auto *CI = cast<CastInst>(&I);
9308     State.ILV->setDebugLocFromInst(CI);
9309 
    // Vectorize casts.
9311     Type *DestTy = (State.VF.isScalar())
9312                        ? CI->getType()
9313                        : VectorType::get(CI->getType(), State.VF);
9314 
9315     for (unsigned Part = 0; Part < State.UF; ++Part) {
9316       Value *A = State.get(getOperand(0), Part);
9317       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
9318       State.set(this, Cast, Part);
9319       State.addMetadata(Cast, &I);
9320     }
9321     break;
9322   }
9323   default:
9324     // This instruction is not vectorized by simple widening.
9325     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
9326     llvm_unreachable("Unhandled instruction!");
9327   } // end of switch.
9328 }
9329 
9330 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9331   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
9332   // Construct a vector GEP by widening the operands of the scalar GEP as
9333   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
9334   // results in a vector of pointers when at least one operand of the GEP
9335   // is vector-typed. Thus, to keep the representation compact, we only use
9336   // vector-typed operands for loop-varying values.
9337 
9338   if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
9339     // If we are vectorizing, but the GEP has only loop-invariant operands,
9340     // the GEP we build (by only using vector-typed operands for
9341     // loop-varying values) would be a scalar pointer. Thus, to ensure we
9342     // produce a vector of pointers, we need to either arbitrarily pick an
9343     // operand to broadcast, or broadcast a clone of the original GEP.
9344     // Here, we broadcast a clone of the original.
9345     //
9346     // TODO: If at some point we decide to scalarize instructions having
9347     //       loop-invariant operands, this special case will no longer be
9348     //       required. We would add the scalarization decision to
9349     //       collectLoopScalars() and teach getVectorValue() to broadcast
9350     //       the lane-zero scalar value.
9351     auto *Clone = State.Builder.Insert(GEP->clone());
9352     for (unsigned Part = 0; Part < State.UF; ++Part) {
9353       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
9354       State.set(this, EntryPart, Part);
9355       State.addMetadata(EntryPart, GEP);
9356     }
9357   } else {
9358     // If the GEP has at least one loop-varying operand, we are sure to
9359     // produce a vector of pointers. But if we are only unrolling, we want
9360     // to produce a scalar GEP for each unroll part. Thus, the GEP we
9361     // produce with the code below will be scalar (if VF == 1) or vector
9362     // (otherwise). Note that for the unroll-only case, we still maintain
9363     // values in the vector mapping with initVector, as we do for other
9364     // instructions.
9365     for (unsigned Part = 0; Part < State.UF; ++Part) {
9366       // The pointer operand of the new GEP. If it's loop-invariant, we
9367       // won't broadcast it.
9368       auto *Ptr = IsPtrLoopInvariant
9369                       ? State.get(getOperand(0), VPIteration(0, 0))
9370                       : State.get(getOperand(0), Part);
9371 
9372       // Collect all the indices for the new GEP. If any index is
9373       // loop-invariant, we won't broadcast it.
9374       SmallVector<Value *, 4> Indices;
9375       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9376         VPValue *Operand = getOperand(I);
9377         if (IsIndexLoopInvariant[I - 1])
9378           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9379         else
9380           Indices.push_back(State.get(Operand, Part));
9381       }
9382 
9383       // If the GEP instruction is vectorized and was in a basic block that
9384       // needed predication, we can't propagate the poison-generating 'inbounds'
9385       // flag. The control flow has been linearized and the GEP is no longer
9386       // guarded by the predicate, which could make the 'inbounds' properties to
9387       // no longer hold.
9388       bool IsInBounds =
9389           GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9390 
9391       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
9392       // but it should be a vector, otherwise.
9393       auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
9394                                              Indices, "", IsInBounds);
9395       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9396              "NewGEP is not a pointer vector");
9397       State.set(this, NewGEP, Part);
9398       State.addMetadata(NewGEP, GEP);
9399     }
9400   }
9401 }
9402 
9403 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9404   assert(!State.Instance && "Int or FP induction being replicated.");
9405 
9406   Value *Start = getStartValue()->getLiveInIRValue();
9407   const InductionDescriptor &ID = getInductionDescriptor();
9408   TruncInst *Trunc = getTruncInst();
9409   IRBuilderBase &Builder = State.Builder;
9410   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9411   assert(State.VF.isVector() && "must have vector VF");
9412 
9413   // The value from the original loop to which we are mapping the new induction
9414   // variable.
9415   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9416 
9417   // Fast-math-flags propagate from the original induction instruction.
9418   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9419   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9420     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9421 
9422   // Now do the actual transformations, and start with fetching the step value.
9423   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9424 
9425   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
9426          "Expected either an induction phi-node or a truncate of it!");
9427 
9428   // Construct the initial value of the vector IV in the vector loop preheader
9429   auto CurrIP = Builder.saveIP();
9430   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9431   Builder.SetInsertPoint(VectorPH->getTerminator());
9432   if (isa<TruncInst>(EntryVal)) {
9433     assert(Start->getType()->isIntegerTy() &&
9434            "Truncation requires an integer type");
9435     auto *TruncType = cast<IntegerType>(EntryVal->getType());
9436     Step = Builder.CreateTrunc(Step, TruncType);
9437     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
9438   }
9439 
9440   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
9441   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
9442   Value *SteppedStart = getStepVector(
9443       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
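  // Lane i of SteppedStart holds the induction value for lane i of the first
  // vector iteration, i.e. Start <op> (i * Step) with <op> the induction
  // opcode.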
9444 
9445   // We create vector phi nodes for both integer and floating-point induction
9446   // variables. Here, we determine the kind of arithmetic we will perform.
9447   Instruction::BinaryOps AddOp;
9448   Instruction::BinaryOps MulOp;
9449   if (Step->getType()->isIntegerTy()) {
9450     AddOp = Instruction::Add;
9451     MulOp = Instruction::Mul;
9452   } else {
9453     AddOp = ID.getInductionOpcode();
9454     MulOp = Instruction::FMul;
9455   }
9456 
9457   // Multiply the vectorization factor by the step using integer or
9458   // floating-point arithmetic as appropriate.
9459   Type *StepType = Step->getType();
9460   Value *RuntimeVF;
9461   if (Step->getType()->isFloatingPointTy())
9462     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
9463   else
9464     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
9465   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
9466 
9467   // Create a vector splat to use in the induction update.
9468   //
9469   // FIXME: If the step is non-constant, we create the vector splat with
9470   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
9471   //        handle a constant vector splat.
9472   Value *SplatVF = isa<Constant>(Mul)
9473                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
9474                        : Builder.CreateVectorSplat(State.VF, Mul);
9475   Builder.restoreIP(CurrIP);
9476 
9477   // We may need to add the step a number of times, depending on the unroll
9478   // factor. The last of those goes into the PHI.
9479   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
9480                                     &*State.CFG.PrevBB->getFirstInsertionPt());
9481   VecInd->setDebugLoc(EntryVal->getDebugLoc());
9482   Instruction *LastInduction = VecInd;
9483   for (unsigned Part = 0; Part < State.UF; ++Part) {
9484     State.set(this, LastInduction, Part);
9485 
9486     if (isa<TruncInst>(EntryVal))
9487       State.addMetadata(LastInduction, EntryVal);
9488 
9489     LastInduction = cast<Instruction>(
9490         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
9491     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
9492   }
9493 
9494   LastInduction->setName("vec.ind.next");
9495   VecInd->addIncoming(SteppedStart, VectorPH);
9496   // Add induction update using an incorrect block temporarily. The phi node
9497   // will be fixed after VPlan execution. Note that at this point the latch
9498   // block cannot be used, as it does not exist yet.
9499   // TODO: Model increment value in VPlan, by turning the recipe into a
9500   // multi-def and a subclass of VPHeaderPHIRecipe.
9501   VecInd->addIncoming(LastInduction, VectorPH);
9502 }
9503 
9504 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9505   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9506          "Not a pointer induction according to InductionDescriptor!");
9507   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9508          "Unexpected type.");
9509 
9510   auto *IVR = getParent()->getPlan()->getCanonicalIV();
9511   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
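  // Two lowering strategies are used: if only scalar pointer values are
  // needed, compute a scalar GEP per lane from the canonical IV; otherwise
  // build a pointer phi that is advanced by VF * UF elements per iteration
  // and compute a vector GEP for each unrolled part.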
9512 
9513   if (onlyScalarsGenerated(State.VF)) {
9514     // This is the normalized GEP that starts counting at zero.
9515     Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9516         CanonicalIV, IndDesc.getStep()->getType());
9517     // Determine the number of scalars we need to generate for each unroll
9518     // iteration. If the instruction is uniform, we only need to generate the
9519     // first lane. Otherwise, we generate all VF values.
9520     bool IsUniform = vputils::onlyFirstLaneUsed(this);
9521     assert((IsUniform || !State.VF.isScalable()) &&
9522            "Cannot scalarize a scalable VF");
9523     unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9524 
9525     for (unsigned Part = 0; Part < State.UF; ++Part) {
9526       Value *PartStart =
9527           createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9528 
9529       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9530         Value *Idx = State.Builder.CreateAdd(
9531             PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9532         Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9533 
9534         Value *Step = CreateStepValue(IndDesc.getStep(), SE,
9535                                       State.CFG.PrevBB->getTerminator());
9536         Value *SclrGep = emitTransformedIndex(
9537             State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
9538         SclrGep->setName("next.gep");
9539         State.set(this, SclrGep, VPIteration(Part, Lane));
9540       }
9541     }
9542     return;
9543   }
9544 
9545   assert(isa<SCEVConstant>(IndDesc.getStep()) &&
9546          "Induction step not a SCEV constant!");
9547   Type *PhiType = IndDesc.getStep()->getType();
9548 
9549   // Build a pointer phi
9550   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9551   Type *ScStValueType = ScalarStartValue->getType();
9552   PHINode *NewPointerPhi =
9553       PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9554 
9555   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9556   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9557 
  // A pointer induction is performed by using a GEP to step the pointer.
9559   const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout();
9560   Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9561 
9562   const SCEV *ScalarStep = IndDesc.getStep();
9563   SCEVExpander Exp(SE, DL, "induction");
9564   Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
9565   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9566   Value *NumUnrolledElems =
9567       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9568   Value *InductionGEP = GetElementPtrInst::Create(
9569       IndDesc.getElementType(), NewPointerPhi,
9570       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9571       InductionLoc);
9572   // Add induction update using an incorrect block temporarily. The phi node
9573   // will be fixed after VPlan execution. Note that at this point the latch
9574   // block cannot be used, as it does not exist yet.
9575   // TODO: Model increment value in VPlan, by turning the recipe into a
9576   // multi-def and a subclass of VPHeaderPHIRecipe.
9577   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9578 
9579   // Create UF many actual address geps that use the pointer
9580   // phi as base and a vectorized version of the step value
9581   // (<step*0, ..., step*N>) as offset.
9582   for (unsigned Part = 0; Part < State.UF; ++Part) {
9583     Type *VecPhiType = VectorType::get(PhiType, State.VF);
9584     Value *StartOffsetScalar =
9585         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9586     Value *StartOffset =
9587         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9588     // Create a vector of consecutive numbers from zero to VF.
9589     StartOffset = State.Builder.CreateAdd(
9590         StartOffset, State.Builder.CreateStepVector(VecPhiType));
9591 
9592     Value *GEP = State.Builder.CreateGEP(
9593         IndDesc.getElementType(), NewPointerPhi,
9594         State.Builder.CreateMul(
9595             StartOffset,
9596             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9597             "vector.gep"));
9598     State.set(this, GEP, Part);
9599   }
9600 }
9601 
9602 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9603   assert(!State.Instance && "VPScalarIVStepsRecipe being replicated.");
9604 
9605   // Fast-math-flags propagate from the original induction instruction.
9606   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9607   if (IndDesc.getInductionBinOp() &&
9608       isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9609     State.Builder.setFastMathFlags(
9610         IndDesc.getInductionBinOp()->getFastMathFlags());
9611 
9612   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
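  // Derive the scalar IV for lane 0: start from the canonical IV, convert and
  // apply the start/step transform if this induction is not the canonical one,
  // and truncate both the IV and the step if a narrower type was requested.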
9613   auto CreateScalarIV = [&](Value *&Step) -> Value * {
9614     Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9615     auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
9616     if (!isCanonical() || CanonicalIV->getType() != Ty) {
9617       ScalarIV =
9618           Ty->isIntegerTy()
9619               ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty)
9620               : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty);
9621       ScalarIV = emitTransformedIndex(State.Builder, ScalarIV,
9622                                       getStartValue()->getLiveInIRValue(), Step,
9623                                       IndDesc);
9624       ScalarIV->setName("offset.idx");
9625     }
9626     if (TruncToTy) {
9627       assert(Step->getType()->isIntegerTy() &&
9628              "Truncation requires an integer step");
9629       ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy);
9630       Step = State.Builder.CreateTrunc(Step, TruncToTy);
9631     }
9632     return ScalarIV;
9633   };
9634 
9635   Value *ScalarIV = CreateScalarIV(Step);
9636   if (State.VF.isVector()) {
9637     buildScalarSteps(ScalarIV, Step, IndDesc, this, State);
9638     return;
9639   }
9640 
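  // VF is scalar here: emit one induction value per unroll part,
  // ScalarIV + (Part * VF) * Step, using the FP opcode for FP inductions.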
9641   for (unsigned Part = 0; Part < State.UF; ++Part) {
9642     assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
9643     Value *EntryPart;
9644     if (Step->getType()->isFloatingPointTy()) {
9645       Value *StartIdx =
9646           getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part);
9647       // Floating-point operations inherit FMF via the builder's flags.
9648       Value *MulOp = State.Builder.CreateFMul(StartIdx, Step);
9649       EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(),
9650                                             ScalarIV, MulOp);
9651     } else {
9652       Value *StartIdx =
9653           getRuntimeVF(State.Builder, Step->getType(), State.VF * Part);
9654       EntryPart = State.Builder.CreateAdd(
9655           ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction");
9656     }
9657     State.set(this, EntryPart, Part);
9658   }
9659 }
9660 
9661 void VPBlendRecipe::execute(VPTransformState &State) {
9662   State.ILV->setDebugLocFromInst(Phi);
9663   // We know that all PHIs in non-header blocks are converted into
9664   // selects, so we don't have to worry about the insertion order and we
9665   // can just use the builder.
9666   // At this point we generate the predication tree. There may be
9667   // duplications since this is a simple recursive scan, but future
9668   // optimizations will clean it up.
9669 
9670   unsigned NumIncoming = getNumIncomingValues();
9671 
9672   // Generate a sequence of selects of the form:
9673   // SELECT(Mask3, In3,
9674   //        SELECT(Mask2, In2,
9675   //               SELECT(Mask1, In1,
9676   //                      In0)))
9677   // Note that Mask0 is never used: lanes for which no path reaches this phi
9678   // are essentially undef and are taken from In0.
9679   InnerLoopVectorizer::VectorParts Entry(State.UF);
9680   for (unsigned In = 0; In < NumIncoming; ++In) {
9681     for (unsigned Part = 0; Part < State.UF; ++Part) {
9682       // We might have single edge PHIs (blocks) - use an identity
9683       // 'select' for the first PHI operand.
9684       Value *In0 = State.get(getIncomingValue(In), Part);
9685       if (In == 0)
9686         Entry[Part] = In0; // Initialize with the first incoming value.
9687       else {
9688         // Select between the current value and the previous incoming edge
9689         // based on the incoming mask.
9690         Value *Cond = State.get(getMask(In), Part);
9691         Entry[Part] =
9692             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9693       }
9694     }
9695   }
9696   for (unsigned Part = 0; Part < State.UF; ++Part)
9697     State.set(this, Entry[Part], Part);
9698 }
9699 
9700 void VPInterleaveRecipe::execute(VPTransformState &State) {
9701   assert(!State.Instance && "Interleave group being replicated.");
9702   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9703                                       getStoredValues(), getMask());
9704 }
9705 
9706 void VPReductionRecipe::execute(VPTransformState &State) {
9707   assert(!State.Instance && "Reduction being replicated.");
9708   Value *PrevInChain = State.get(getChainOp(), 0);
9709   RecurKind Kind = RdxDesc->getRecurrenceKind();
9710   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9711   // Propagate the fast-math flags carried by the underlying instruction.
9712   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9713   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9714   for (unsigned Part = 0; Part < State.UF; ++Part) {
9715     Value *NewVecOp = State.get(getVecOp(), Part);
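    // For a conditional reduction, blend masked-out lanes with the reduction
    // identity so they do not affect the reduced value.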
9716     if (VPValue *Cond = getCondOp()) {
9717       Value *NewCond = State.get(Cond, Part);
9718       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9719       Value *Iden = RdxDesc->getRecurrenceIdentity(
9720           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9721       Value *IdenVec =
9722           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9723       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9724       NewVecOp = Select;
9725     }
9726     Value *NewRed;
9727     Value *NextInChain;
9728     if (IsOrdered) {
9729       if (State.VF.isVector())
9730         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9731                                         PrevInChain);
9732       else
9733         NewRed = State.Builder.CreateBinOp(
9734             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9735             NewVecOp);
9736       PrevInChain = NewRed;
9737     } else {
9738       PrevInChain = State.get(getChainOp(), Part);
9739       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9740     }
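    // Fold the new partial result into the chain: min/max recurrences use a
    // min/max op, ordered reductions already include the chain, and all other
    // kinds use the reduction's binary opcode.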
9741     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9742       NextInChain =
9743           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9744                          NewRed, PrevInChain);
9745     } else if (IsOrdered)
9746       NextInChain = NewRed;
9747     else
9748       NextInChain = State.Builder.CreateBinOp(
9749           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9750           PrevInChain);
9751     State.set(this, NextInChain, Part);
9752   }
9753 }
9754 
9755 void VPReplicateRecipe::execute(VPTransformState &State) {
9756   if (State.Instance) { // Generate a single instance.
9757     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9758     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9759                                     IsPredicated, State);
9760     // Insert scalar instance packing it into a vector.
9761     if (AlsoPack && State.VF.isVector()) {
9762       // If we're constructing lane 0, initialize to start from poison.
9763       if (State.Instance->Lane.isFirstLane()) {
9764         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9765         Value *Poison = PoisonValue::get(
9766             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9767         State.set(this, Poison, State.Instance->Part);
9768       }
9769       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9770     }
9771     return;
9772   }
9773 
9774   // Generate scalar instances for all VF lanes of all UF parts, unless the
9775   // instruction is uniform, in which case generate only the first lane for
9776   // each of the UF parts.
9777   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9778   assert((!State.VF.isScalable() || IsUniform) &&
9779          "Can't scalarize a scalable vector");
9780   for (unsigned Part = 0; Part < State.UF; ++Part)
9781     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9782       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9783                                       VPIteration(Part, Lane), IsPredicated,
9784                                       State);
9785 }
9786 
9787 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9788   assert(State.Instance && "Branch on Mask works only on single instance.");
9789 
9790   unsigned Part = State.Instance->Part;
9791   unsigned Lane = State.Instance->Lane.getKnownLane();
9792 
9793   Value *ConditionBit = nullptr;
9794   VPValue *BlockInMask = getMask();
9795   if (BlockInMask) {
9796     ConditionBit = State.get(BlockInMask, Part);
9797     if (ConditionBit->getType()->isVectorTy())
9798       ConditionBit = State.Builder.CreateExtractElement(
9799           ConditionBit, State.Builder.getInt32(Lane));
9800   } else // Block in mask is all-one.
9801     ConditionBit = State.Builder.getTrue();
9802 
9803   // Replace the temporary unreachable terminator with a new conditional branch,
9804   // whose two destinations will be set later when they are created.
9805   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9806   assert(isa<UnreachableInst>(CurrentTerminator) &&
9807          "Expected to replace unreachable terminator with conditional branch.");
9808   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9809   CondBr->setSuccessor(0, nullptr);
9810   ReplaceInstWithInst(CurrentTerminator, CondBr);
9811 }
9812 
9813 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9814   assert(State.Instance && "Predicated instruction PHI works per instance.");
9815   Instruction *ScalarPredInst =
9816       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9817   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9818   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9819   assert(PredicatingBB && "Predicated block has no single predecessor.");
9820   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9821          "operand must be VPReplicateRecipe");
9822 
9823   // By current pack/unpack logic we need to generate only a single phi node: if
9824   // a vector value for the predicated instruction exists at this point it means
9825   // the instruction has vector users only, and a phi for the vector value is
9826   // needed. In this case the recipe of the predicated instruction is marked to
9827   // also do that packing, thereby "hoisting" the insert-element sequence.
9828   // Otherwise, a phi node for the scalar value is needed.
9829   unsigned Part = State.Instance->Part;
9830   if (State.hasVectorValue(getOperand(0), Part)) {
9831     Value *VectorValue = State.get(getOperand(0), Part);
9832     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9833     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9834     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9835     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9836     if (State.hasVectorValue(this, Part))
9837       State.reset(this, VPhi, Part);
9838     else
9839       State.set(this, VPhi, Part);
9840     // NOTE: Currently we need to update the value of the operand, so the next
9841     // predicated iteration inserts its generated value in the correct vector.
9842     State.reset(getOperand(0), VPhi, Part);
9843   } else {
9844     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9845     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9846     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9847                      PredicatingBB);
9848     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9849     if (State.hasScalarValue(this, *State.Instance))
9850       State.reset(this, Phi, *State.Instance);
9851     else
9852       State.set(this, Phi, *State.Instance);
9853     // NOTE: Currently we need to update the value of the operand, so the next
9854     // predicated iteration inserts its generated value in the correct vector.
9855     State.reset(getOperand(0), Phi, *State.Instance);
9856   }
9857 }
9858 
9859 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9860   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9861 
9862   // Attempt to issue a wide load.
9863   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9864   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9865 
9866   assert((LI || SI) && "Invalid Load/Store instruction");
9867   assert((!SI || StoredValue) && "No stored value provided for widened store");
9868   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9869 
9870   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9871 
9872   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9873   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9874   bool CreateGatherScatter = !Consecutive;
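  // Non-consecutive accesses become gathers/scatters; consecutive accesses
  // become wide (possibly masked and/or reversed) loads and stores.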
9875 
9876   auto &Builder = State.Builder;
9877   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9878   bool isMaskRequired = getMask();
9879   if (isMaskRequired)
9880     for (unsigned Part = 0; Part < State.UF; ++Part)
9881       BlockInMaskParts[Part] = State.get(getMask(), Part);
9882 
9883   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9884     // Calculate the pointer for the specific unroll-part.
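    // For a forward access this is Ptr + Part * VF (in elements); for a
    // reverse access it is Ptr - Part * RunTimeVF + (1 - RunTimeVF), so the
    // contiguous access starts at the address of the part's last lane.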
9885     GetElementPtrInst *PartPtr = nullptr;
9886 
9887     bool InBounds = false;
9888     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9889       InBounds = gep->isInBounds();
9890     if (Reverse) {
9891       // If the address is consecutive but reversed, then the
9892       // wide store needs to start at the last vector element.
9893       // RunTimeVF = VScale * VF.getKnownMinValue()
9894       // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue()
9895       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9896       // NumElt = -Part * RunTimeVF
9897       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9898       // LastLane = 1 - RunTimeVF
9899       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
9900       PartPtr =
9901           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9902       PartPtr->setIsInBounds(InBounds);
9903       PartPtr = cast<GetElementPtrInst>(
9904           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9905       PartPtr->setIsInBounds(InBounds);
9906       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9907         BlockInMaskParts[Part] =
9908             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9909     } else {
9910       Value *Increment =
9911           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9912       PartPtr = cast<GetElementPtrInst>(
9913           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9914       PartPtr->setIsInBounds(InBounds);
9915     }
9916 
9917     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9918     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9919   };
9920 
9921   // Handle Stores:
9922   if (SI) {
9923     State.ILV->setDebugLocFromInst(SI);
9924 
9925     for (unsigned Part = 0; Part < State.UF; ++Part) {
9926       Instruction *NewSI = nullptr;
9927       Value *StoredVal = State.get(StoredValue, Part);
9928       if (CreateGatherScatter) {
9929         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9930         Value *VectorGep = State.get(getAddr(), Part);
9931         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9932                                             MaskPart);
9933       } else {
9934         if (Reverse) {
9935           // If we store to reverse consecutive memory locations, then we need
9936           // to reverse the order of elements in the stored value.
9937           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9938           // We don't want to update the value in the map as it might be used in
9939           // another expression. So don't call resetVectorValue(StoredVal).
9940         }
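        // Consecutive stores share a uniform base pointer: take the scalar
        // address from part 0, lane 0 and let CreateVecPtr offset it for this
        // part.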
9941         auto *VecPtr =
9942             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9943         if (isMaskRequired)
9944           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9945                                             BlockInMaskParts[Part]);
9946         else
9947           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9948       }
9949       State.addMetadata(NewSI, SI);
9950     }
9951     return;
9952   }
9953 
9954   // Handle loads.
9955   assert(LI && "Must have a load instruction");
9956   State.ILV->setDebugLocFromInst(LI);
9957   for (unsigned Part = 0; Part < State.UF; ++Part) {
9958     Value *NewLI;
9959     if (CreateGatherScatter) {
9960       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9961       Value *VectorGep = State.get(getAddr(), Part);
9962       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9963                                          nullptr, "wide.masked.gather");
9964       State.addMetadata(NewLI, LI);
9965     } else {
9966       auto *VecPtr =
9967           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9968       if (isMaskRequired)
9969         NewLI = Builder.CreateMaskedLoad(
9970             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9971             PoisonValue::get(DataTy), "wide.masked.load");
9972       else
9973         NewLI =
9974             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9975 
9976       // Add metadata to the load, but setVectorValue to the reverse shuffle.
9977       State.addMetadata(NewLI, LI);
9978       if (Reverse)
9979         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9980     }
9981 
9982     State.set(getVPSingleValue(), NewLI, Part);
9983   }
9984 }
9985 
9986 // Determine how to lower the scalar epilogue, which depends on 1) optimizing
9987 // for minimum code-size, 2) predication compiler options, 3) loop hints forcing
9988 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9989 // for predication.
9990 static ScalarEpilogueLowering getScalarEpilogueLowering(
9991     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9992     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9993     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9994     LoopVectorizationLegality &LVL) {
9995   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9996   // don't look at hints or options, and don't request a scalar epilogue.
9997   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9998   // LoopAccessInfo (due to code dependency and not being able to reliably get
9999   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10000   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10001   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10002   // back to the old way and vectorize with versioning when forced. See D81345.)
10003   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10004                                                       PGSOQueryType::IRPass) &&
10005                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10006     return CM_ScalarEpilogueNotAllowedOptSize;
10007 
10008   // 2) If set, obey the directives
10009   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10010     switch (PreferPredicateOverEpilogue) {
10011     case PreferPredicateTy::ScalarEpilogue:
10012       return CM_ScalarEpilogueAllowed;
10013     case PreferPredicateTy::PredicateElseScalarEpilogue:
10014       return CM_ScalarEpilogueNotNeededUsePredicate;
10015     case PreferPredicateTy::PredicateOrDontVectorize:
10016       return CM_ScalarEpilogueNotAllowedUsePredicate;
10017     };
10018   }
10019 
10020   // 3) If set, obey the hints
10021   switch (Hints.getPredicate()) {
10022   case LoopVectorizeHints::FK_Enabled:
10023     return CM_ScalarEpilogueNotNeededUsePredicate;
10024   case LoopVectorizeHints::FK_Disabled:
10025     return CM_ScalarEpilogueAllowed;
10026   };
10027 
10028   // 4) If the TTI hook indicates this is profitable, request predication.
10029   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10030                                        LVL.getLAI()))
10031     return CM_ScalarEpilogueNotNeededUsePredicate;
10032 
10033   return CM_ScalarEpilogueAllowed;
10034 }
10035 
10036 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
10037   // If Values have been set for this Def, return the one relevant for \p Part.
10038   if (hasVectorValue(Def, Part))
10039     return Data.PerPartOutput[Def][Part];
10040 
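  // A def with neither a vector nor a scalar value for this part is treated as
  // a live-in IR value; broadcast it once and cache the splat for this part.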
10041   if (!hasScalarValue(Def, {Part, 0})) {
10042     Value *IRV = Def->getLiveInIRValue();
10043     Value *B = ILV->getBroadcastInstrs(IRV);
10044     set(Def, B, Part);
10045     return B;
10046   }
10047 
10048   Value *ScalarValue = get(Def, {Part, 0});
10049   // If we aren't vectorizing, we can just copy the scalar map values over
10050   // to the vector map.
10051   if (VF.isScalar()) {
10052     set(Def, ScalarValue, Part);
10053     return ScalarValue;
10054   }
10055 
10056   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
10057   bool IsUniform = RepR && RepR->isUniform();
10058 
10059   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
10060   // Check if there is a scalar value for the selected lane.
10061   if (!hasScalarValue(Def, {Part, LastLane})) {
10062     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
10063     assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
10064             isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
10065            "unexpected recipe found to be invariant");
10066     IsUniform = true;
10067     LastLane = 0;
10068   }
10069 
10070   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10071   // Set the insert point after the last scalarized instruction or after the
10072   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10073   // will directly follow the scalar definitions.
10074   auto OldIP = Builder.saveIP();
10075   auto NewIP =
10076       isa<PHINode>(LastInst)
10077           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10078           : std::next(BasicBlock::iterator(LastInst));
10079   Builder.SetInsertPoint(&*NewIP);
10080 
10081   // However, if we are vectorizing, we need to construct the vector values.
10082   // If the value is known to be uniform after vectorization, we can just
10083   // broadcast the scalar value corresponding to lane zero for each unroll
10084   // iteration. Otherwise, we construct the vector values using
10085   // insertelement instructions. Since the resulting vectors are stored in
10086   // State, we will only generate the insertelements once.
10087   Value *VectorValue = nullptr;
10088   if (IsUniform) {
10089     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10090     set(Def, VectorValue, Part);
10091   } else {
10092     // Initialize packing with insertelements to start from poison.
10093     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
10094     Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
10095     set(Def, Poison, Part);
10096     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10097       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10098     VectorValue = get(Def, Part);
10099   }
10100   Builder.restoreIP(OldIP);
10101   return VectorValue;
10102 }
10103 
10104 // Process the loop in the VPlan-native vectorization path. This path builds
10105 // VPlan upfront in the vectorization pipeline, which allows applying
10106 // VPlan-to-VPlan transformations from the very beginning without modifying the
10107 // input LLVM IR.
10108 static bool processLoopInVPlanNativePath(
10109     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10110     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10111     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10112     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10113     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10114     LoopVectorizationRequirements &Requirements) {
10115 
10116   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10117     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10118     return false;
10119   }
10120   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10121   Function *F = L->getHeader()->getParent();
10122   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10123 
10124   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10125       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10126 
10127   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10128                                 &Hints, IAI);
10129   // Use the planner for outer loop vectorization.
10130   // TODO: CM is not used at this point inside the planner. Turn CM into an
10131   // optional argument if we don't need it in the future.
10132   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10133                                Requirements, ORE);
10134 
10135   // Get user vectorization factor.
10136   ElementCount UserVF = Hints.getWidth();
10137 
10138   CM.collectElementTypesForWidening();
10139 
10140   // Plan how to best vectorize, return the best VF and its cost.
10141   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10142 
10143   // If we are stress testing VPlan builds, do not attempt to generate vector
10144   // code. Masked vector code generation support will follow soon.
10145   // Also, do not attempt to vectorize if no vector code will be produced.
10146   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
10147     return false;
10148 
10149   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10150 
10151   {
10152     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10153                              F->getParent()->getDataLayout());
10154     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10155                            &CM, BFI, PSI, Checks);
10156     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10157                       << L->getHeader()->getParent()->getName() << "\"\n");
10158     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10159   }
10160 
10161   // Mark the loop as already vectorized to avoid vectorizing again.
10162   Hints.setAlreadyVectorized();
10163   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10164   return true;
10165 }
10166 
10167 // Emit a remark if there are stores to floats that required a floating point
10168 // extension. If the vectorized loop was generated with floating point there
10169 // will be a performance penalty from the conversion overhead and the change in
10170 // the vector width.
10171 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10172   SmallVector<Instruction *, 4> Worklist;
10173   for (BasicBlock *BB : L->getBlocks()) {
10174     for (Instruction &Inst : *BB) {
10175       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10176         if (S->getValueOperand()->getType()->isFloatTy())
10177           Worklist.push_back(S);
10178       }
10179     }
10180   }
10181 
10182   // Traverse the floating point stores upwards, searching for floating point
10183   // conversions.
10184   SmallPtrSet<const Instruction *, 4> Visited;
10185   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10186   while (!Worklist.empty()) {
10187     auto *I = Worklist.pop_back_val();
10188     if (!L->contains(I))
10189       continue;
10190     if (!Visited.insert(I).second)
10191       continue;
10192 
10193     // Emit a remark if the floating point store required a floating
10194     // point conversion.
10195     // TODO: More work could be done to identify the root cause such as a
10196     // constant or a function return type and point the user to it.
10197     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10198       ORE->emit([&]() {
10199         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10200                                           I->getDebugLoc(), L->getHeader())
10201                << "floating point conversion changes vector width. "
10202                << "Mixed floating point precision requires an up/down "
10203                << "cast that will negatively impact performance.";
10204       });
10205 
10206     for (Use &Op : I->operands())
10207       if (auto *OpI = dyn_cast<Instruction>(Op))
10208         Worklist.push_back(OpI);
10209   }
10210 }
10211 
10212 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10213     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10214                                !EnableLoopInterleaving),
10215       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10216                               !EnableLoopVectorization) {}
10217 
10218 bool LoopVectorizePass::processLoop(Loop *L) {
10219   assert((EnableVPlanNativePath || L->isInnermost()) &&
10220          "VPlan-native path is not enabled. Only process inner loops.");
10221 
10222 #ifndef NDEBUG
10223   const std::string DebugLocStr = getDebugLocString(L);
10224 #endif /* NDEBUG */
10225 
10226   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10227                     << L->getHeader()->getParent()->getName() << "' from "
10228                     << DebugLocStr << "\n");
10229 
10230   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10231 
10232   LLVM_DEBUG(
10233       dbgs() << "LV: Loop hints:"
10234              << " force="
10235              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10236                      ? "disabled"
10237                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10238                             ? "enabled"
10239                             : "?"))
10240              << " width=" << Hints.getWidth()
10241              << " interleave=" << Hints.getInterleave() << "\n");
10242 
10243   // Function containing loop
10244   Function *F = L->getHeader()->getParent();
10245 
10246   // Looking at the diagnostic output is the only way to determine if a loop
10247   // was vectorized (other than looking at the IR or machine code), so it
10248   // is important to generate an optimization remark for each loop. Most of
10249   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10250   // generated as OptimizationRemark and OptimizationRemarkMissed are
10251   // less verbose, reporting vectorized loops and unvectorized loops that may
10252   // benefit from vectorization, respectively.
10253 
10254   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10255     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10256     return false;
10257   }
10258 
10259   PredicatedScalarEvolution PSE(*SE, *L);
10260 
10261   // Check if it is legal to vectorize the loop.
10262   LoopVectorizationRequirements Requirements;
10263   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10264                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10265   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10266     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10267     Hints.emitRemarkWithHints();
10268     return false;
10269   }
10270 
10271   // Check the function attributes and profiles to find out if this function
10272   // should be optimized for size.
10273   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10274       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10275 
10276   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10277   // here. They may require CFG and instruction level transformations before
10278   // even evaluating whether vectorization is profitable. Since we cannot modify
10279   // the incoming IR, we need to build VPlan upfront in the vectorization
10280   // pipeline.
10281   if (!L->isInnermost())
10282     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10283                                         ORE, BFI, PSI, Hints, Requirements);
10284 
10285   assert(L->isInnermost() && "Inner loop expected.");
10286 
10287   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10288   // count by optimizing for size, to minimize overheads.
10289   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10290   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10291     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10292                       << "This loop is worth vectorizing only if no scalar "
10293                       << "iteration overheads are incurred.");
10294     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10295       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10296     else {
10297       LLVM_DEBUG(dbgs() << "\n");
10298       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10299     }
10300   }
10301 
10302   // Check the function attributes to see if implicit floats are allowed.
10303   // FIXME: This check doesn't seem possibly correct -- what if the loop is
10304   // an integer loop and the vector instructions selected are purely integer
10305   // vector instructions?
10306   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10307     reportVectorizationFailure(
10308         "Can't vectorize when the NoImplicitFloat attribute is used",
10309         "loop not vectorized due to NoImplicitFloat attribute",
10310         "NoImplicitFloat", ORE, L);
10311     Hints.emitRemarkWithHints();
10312     return false;
10313   }
10314 
10315   // Check if the target supports potentially unsafe FP vectorization.
10316   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10317   // for the target we're vectorizing for, to make sure none of the
10318   // additional fp-math flags can help.
10319   if (Hints.isPotentiallyUnsafe() &&
10320       TTI->isFPVectorizationPotentiallyUnsafe()) {
10321     reportVectorizationFailure(
10322         "Potentially unsafe FP op prevents vectorization",
10323         "loop not vectorized due to unsafe FP support.",
10324         "UnsafeFP", ORE, L);
10325     Hints.emitRemarkWithHints();
10326     return false;
10327   }
10328 
10329   bool AllowOrderedReductions;
10330   // If the flag is set, use that instead and override the TTI behaviour.
10331   if (ForceOrderedReductions.getNumOccurrences() > 0)
10332     AllowOrderedReductions = ForceOrderedReductions;
10333   else
10334     AllowOrderedReductions = TTI->enableOrderedReductions();
10335   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10336     ORE->emit([&]() {
10337       auto *ExactFPMathInst = Requirements.getExactFPInst();
10338       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10339                                                  ExactFPMathInst->getDebugLoc(),
10340                                                  ExactFPMathInst->getParent())
10341              << "loop not vectorized: cannot prove it is safe to reorder "
10342                 "floating-point operations";
10343     });
10344     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10345                          "reorder floating-point operations\n");
10346     Hints.emitRemarkWithHints();
10347     return false;
10348   }
10349 
10350   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10351   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10352 
10353   // If an override option has been passed in for interleaved accesses, use it.
10354   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10355     UseInterleaved = EnableInterleavedMemAccesses;
10356 
10357   // Analyze interleaved memory accesses.
10358   if (UseInterleaved) {
10359     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10360   }
10361 
10362   // Use the cost model.
10363   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10364                                 F, &Hints, IAI);
10365   CM.collectValuesToIgnore();
10366   CM.collectElementTypesForWidening();
10367 
10368   // Use the planner for vectorization.
10369   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10370                                Requirements, ORE);
10371 
10372   // Get user vectorization factor and interleave count.
10373   ElementCount UserVF = Hints.getWidth();
10374   unsigned UserIC = Hints.getInterleave();
10375 
10376   // Plan how to best vectorize, return the best VF and its cost.
10377   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10378 
10379   VectorizationFactor VF = VectorizationFactor::Disabled();
10380   unsigned IC = 1;
10381 
10382   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10383                            F->getParent()->getDataLayout());
10384   if (MaybeVF) {
10385     if (LVP.requiresTooManyRuntimeChecks()) {
10386       ORE->emit([&]() {
10387         return OptimizationRemarkAnalysisAliasing(
10388                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10389                    L->getHeader())
10390                << "loop not vectorized: cannot prove it is safe to reorder "
10391                   "memory operations";
10392       });
10393       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10394       Hints.emitRemarkWithHints();
10395       return false;
10396     }
10397     VF = *MaybeVF;
10398     // Select the interleave count.
10399     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10400 
10401     unsigned SelectedIC = std::max(IC, UserIC);
10402     // Optimistically generate runtime checks if they are needed. Drop them if
10403     // they turn out not to be profitable.
10404     if (VF.Width.isVector() || SelectedIC > 1)
10405       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10406   }
10407 
10408   // Identify the diagnostic messages that should be produced.
10409   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10410   bool VectorizeLoop = true, InterleaveLoop = true;
10411   if (VF.Width.isScalar()) {
10412     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10413     VecDiagMsg = std::make_pair(
10414         "VectorizationNotBeneficial",
10415         "the cost-model indicates that vectorization is not beneficial");
10416     VectorizeLoop = false;
10417   }
10418 
10419   if (!MaybeVF && UserIC > 1) {
10420     // Tell the user interleaving was avoided up-front, despite being explicitly
10421     // requested.
10422     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10423                          "interleaving should be avoided up front\n");
10424     IntDiagMsg = std::make_pair(
10425         "InterleavingAvoided",
10426         "Ignoring UserIC, because interleaving was avoided up front");
10427     InterleaveLoop = false;
10428   } else if (IC == 1 && UserIC <= 1) {
10429     // Tell the user interleaving is not beneficial.
10430     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10431     IntDiagMsg = std::make_pair(
10432         "InterleavingNotBeneficial",
10433         "the cost-model indicates that interleaving is not beneficial");
10434     InterleaveLoop = false;
10435     if (UserIC == 1) {
10436       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10437       IntDiagMsg.second +=
10438           " and is explicitly disabled or interleave count is set to 1";
10439     }
10440   } else if (IC > 1 && UserIC == 1) {
10441     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10442     LLVM_DEBUG(
10443         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10444     IntDiagMsg = std::make_pair(
10445         "InterleavingBeneficialButDisabled",
10446         "the cost-model indicates that interleaving is beneficial "
10447         "but is explicitly disabled or interleave count is set to 1");
10448     InterleaveLoop = false;
10449   }
10450 
10451   // Override IC if user provided an interleave count.
10452   IC = UserIC > 0 ? UserIC : IC;
10453 
10454   // Emit diagnostic messages, if any.
10455   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10456   if (!VectorizeLoop && !InterleaveLoop) {
10457     // Do not vectorize or interleave the loop.
10458     ORE->emit([&]() {
10459       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10460                                       L->getStartLoc(), L->getHeader())
10461              << VecDiagMsg.second;
10462     });
10463     ORE->emit([&]() {
10464       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10465                                       L->getStartLoc(), L->getHeader())
10466              << IntDiagMsg.second;
10467     });
10468     return false;
10469   } else if (!VectorizeLoop && InterleaveLoop) {
10470     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10471     ORE->emit([&]() {
10472       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10473                                         L->getStartLoc(), L->getHeader())
10474              << VecDiagMsg.second;
10475     });
10476   } else if (VectorizeLoop && !InterleaveLoop) {
10477     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10478                       << ") in " << DebugLocStr << '\n');
10479     ORE->emit([&]() {
10480       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10481                                         L->getStartLoc(), L->getHeader())
10482              << IntDiagMsg.second;
10483     });
10484   } else if (VectorizeLoop && InterleaveLoop) {
10485     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10486                       << ") in " << DebugLocStr << '\n');
10487     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10488   }
10489 
10490   bool DisableRuntimeUnroll = false;
10491   MDNode *OrigLoopID = L->getLoopID();
10492   {
10493     using namespace ore;
10494     if (!VectorizeLoop) {
10495       assert(IC > 1 && "interleave count should not be 1 or 0");
10496       // If we decided that it is not legal to vectorize the loop, then
10497       // interleave it.
10498       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10499                                  &CM, BFI, PSI, Checks);
10500 
10501       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10502       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10503 
10504       ORE->emit([&]() {
10505         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10506                                   L->getHeader())
10507                << "interleaved loop (interleaved count: "
10508                << NV("InterleaveCount", IC) << ")";
10509       });
10510     } else {
10511       // If we decided that it is *legal* to vectorize the loop, then do it.
10512 
10513       // Consider vectorizing the epilogue too if it's profitable.
10514       VectorizationFactor EpilogueVF =
10515           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10516       if (EpilogueVF.Width.isVector()) {
10517 
10518         // The first pass vectorizes the main loop and creates a scalar epilogue
10519         // to be vectorized by executing the plan (potentially with a different
10520         // factor) again shortly afterwards.
10521         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
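        // EPI carries the main-loop VF/IC and the epilogue VF (with an
        // interleave count of 1) across both vectorization passes.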
10522         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10523                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10524 
10525         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10526         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10527                         DT);
10528         ++LoopsVectorized;
10529 
10530         // Second pass vectorizes the epilogue and adjusts the control flow
10531         // edges from the first pass.
10532         EPI.MainLoopVF = EPI.EpilogueVF;
10533         EPI.MainLoopUF = EPI.EpilogueUF;
10534         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10535                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10536                                                  Checks);
10537 
10538         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10539         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10540         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10541         Header->setName("vec.epilog.vector.body");
10542 
10543         // Ensure that the start values for any VPReductionPHIRecipes are
10544         // updated before vectorizing the epilogue loop.
10545         for (VPRecipeBase &R : Header->phis()) {
10546           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10547             if (auto *Resume = MainILV.getReductionResumeValue(
10548                     ReductionPhi->getRecurrenceDescriptor())) {
10549               VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume);
10550               ReductionPhi->setOperand(0, StartVal);
10551             }
10552           }
10553         }
10554 
10555         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10556                         DT);
10557         ++LoopsEpilogueVectorized;
10558 
10559         if (!MainILV.areSafetyChecksAdded())
10560           DisableRuntimeUnroll = true;
10561       } else {
10562         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10563                                &LVL, &CM, BFI, PSI, Checks);
10564 
10565         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10566         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
10567         ++LoopsVectorized;
10568 
10569         // Add metadata to disable runtime unrolling a scalar loop when there
10570         // are no runtime checks about strides and memory. A scalar loop that is
10571         // rarely used is not worth unrolling.
10572         if (!LB.areSafetyChecksAdded())
10573           DisableRuntimeUnroll = true;
10574       }
10575       // Report the vectorization decision.
10576       ORE->emit([&]() {
10577         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10578                                   L->getHeader())
10579                << "vectorized loop (vectorization width: "
10580                << NV("VectorizationFactor", VF.Width)
10581                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10582       });
10583     }
10584 
10585     if (ORE->allowExtraAnalysis(LV_NAME))
10586       checkMixedPrecision(L, ORE);
10587   }
10588 
10589   Optional<MDNode *> RemainderLoopID =
10590       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10591                                       LLVMLoopVectorizeFollowupEpilogue});
10592   if (RemainderLoopID) {
10593     L->setLoopID(RemainderLoopID.getValue());
10594   } else {
10595     if (DisableRuntimeUnroll)
10596       AddRuntimeUnrollDisableMetaData(L);
10597 
10598     // Mark the loop as already vectorized to avoid vectorizing again.
10599     Hints.setAlreadyVectorized();
10600   }
10601 
10602   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10603   return true;
10604 }
10605 
10606 LoopVectorizeResult LoopVectorizePass::runImpl(
10607     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10608     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10609     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10610     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10611     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10612   SE = &SE_;
10613   LI = &LI_;
10614   TTI = &TTI_;
10615   DT = &DT_;
10616   BFI = &BFI_;
10617   TLI = TLI_;
10618   AA = &AA_;
10619   AC = &AC_;
10620   GetLAA = &GetLAA_;
10621   DB = &DB_;
10622   ORE = &ORE_;
10623   PSI = PSI_;
10624 
10625   // Don't attempt if
10626   // 1. the target claims to have no vector registers, and
10627   // 2. interleaving won't help ILP.
10628   //
10629   // The second condition is necessary because, even if the target has no
10630   // vector registers, loop vectorization may still enable scalar
10631   // interleaving.
10632   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10633       TTI->getMaxInterleaveFactor(1) < 2)
10634     return LoopVectorizeResult(false, false);
10635 
10636   bool Changed = false, CFGChanged = false;
10637 
10638   // The vectorizer requires loops to be in simplified form.
10639   // Since simplification may add new inner loops, it has to run before the
10640   // legality and profitability checks. This means running the loop vectorizer
10641   // will simplify all loops, regardless of whether anything ends up being
10642   // vectorized.
10643   for (auto &L : *LI)
10644     Changed |= CFGChanged |=
10645         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10646 
10647   // Build up a worklist of inner-loops to vectorize. This is necessary as
10648   // the act of vectorizing or partially unrolling a loop creates new loops
10649   // and can invalidate iterators across the loops.
10650   SmallVector<Loop *, 8> Worklist;
10651 
10652   for (Loop *L : *LI)
10653     collectSupportedLoops(*L, LI, ORE, Worklist);
10654 
10655   LoopsAnalyzed += Worklist.size();
10656 
10657   // Now walk the identified inner loops.
10658   while (!Worklist.empty()) {
10659     Loop *L = Worklist.pop_back_val();
10660 
10661     // For the inner loops we actually process, form LCSSA to simplify the
10662     // transform.
10663     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10664 
10665     Changed |= CFGChanged |= processLoop(L);
10666   }
10667 
10668   // Process each loop nest in the function.
10669   return LoopVectorizeResult(Changed, CFGChanged);
10670 }
10671 
10672 PreservedAnalyses LoopVectorizePass::run(Function &F,
10673                                          FunctionAnalysisManager &AM) {
10674   auto &LI = AM.getResult<LoopAnalysis>(F);
10675   // There are no loops in the function. Return before computing other expensive
10676   // analyses.
10677   if (LI.empty())
10678     return PreservedAnalyses::all();
10679   auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10680   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10681   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10682   auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10683   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10684   auto &AA = AM.getResult<AAManager>(F);
10685   auto &AC = AM.getResult<AssumptionAnalysis>(F);
10686   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10687   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10688
10689   auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
10690   std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10691       [&](Loop &L) -> const LoopAccessInfo & {
10692     LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
10693                                       TLI, TTI, nullptr, nullptr, nullptr};
10694     return LAM.getResult<LoopAccessAnalysis>(L, AR);
10695   };
10696   auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10697   ProfileSummaryInfo *PSI =
10698       MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10699   LoopVectorizeResult Result =
10700       runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10701   if (!Result.MadeAnyChange)
10702     return PreservedAnalyses::all();
10703   PreservedAnalyses PA;
10704
10705   // We currently do not preserve loopinfo/dominator analyses with outer loop
10706   // vectorization. Until this is addressed, mark these analyses as preserved
10707   // only for non-VPlan-native path.
10708   // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10709   if (!EnableVPlanNativePath) {
10710     PA.preserve<LoopAnalysis>();
10711     PA.preserve<DominatorTreeAnalysis>();
10712   }
10713 
10714   if (Result.MadeCFGChange) {
10715     // Making CFG changes likely means a loop got vectorized. Indicate that
10716     // extra simplification passes should be run.
10717     // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10718     // be run if runtime checks have been added.
10719     AM.getResult<ShouldRunExtraVectorPasses>(F);
10720     PA.preserve<ShouldRunExtraVectorPasses>();
10721   } else {
10722     PA.preserveSet<CFGAnalyses>();
10723   }
10724   return PA;
10725 }
10726 
10727 void LoopVectorizePass::printPipeline(
10728     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10729   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10730       OS, MapClassName2PassName);
10731 
10732   OS << "<";
10733   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10734   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10735   OS << ">";
10736 }
10737