1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
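//
// As a rough illustration (not actual compiler output), a scalar loop such as
//   for (i = 0; i < n; ++i) A[i] = B[i] + C[i];
// is conceptually turned, for a vector width of 4, into
//   for (i = 0; i + 3 < n; i += 4) A[i:i+3] = B[i:i+3] + C[i:i+3];
// with any remaining iterations handled by a scalar epilogue loop.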
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/Metadata.h"
116 #include "llvm/IR/Module.h"
117 #include "llvm/IR/Operator.h"
118 #include "llvm/IR/PatternMatch.h"
119 #include "llvm/IR/Type.h"
120 #include "llvm/IR/Use.h"
121 #include "llvm/IR/User.h"
122 #include "llvm/IR/Value.h"
123 #include "llvm/IR/ValueHandle.h"
124 #include "llvm/IR/Verifier.h"
125 #include "llvm/InitializePasses.h"
126 #include "llvm/Pass.h"
127 #include "llvm/Support/Casting.h"
128 #include "llvm/Support/CommandLine.h"
129 #include "llvm/Support/Compiler.h"
130 #include "llvm/Support/Debug.h"
131 #include "llvm/Support/ErrorHandling.h"
132 #include "llvm/Support/InstructionCost.h"
133 #include "llvm/Support/MathExtras.h"
134 #include "llvm/Support/raw_ostream.h"
135 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
136 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
141 #include "llvm/Transforms/Utils/SizeOpts.h"
142 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
143 #include <algorithm>
144 #include <cassert>
145 #include <cstdint>
146 #include <functional>
147 #include <iterator>
148 #include <limits>
149 #include <memory>
150 #include <string>
151 #include <tuple>
152 #include <utility>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 #ifndef NDEBUG
160 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
161 #endif
162 
163 /// @{
164 /// Metadata attribute names
165 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
166 const char LLVMLoopVectorizeFollowupVectorized[] =
167     "llvm.loop.vectorize.followup_vectorized";
168 const char LLVMLoopVectorizeFollowupEpilogue[] =
169     "llvm.loop.vectorize.followup_epilogue";
170 /// @}
171 
172 STATISTIC(LoopsVectorized, "Number of loops vectorized");
173 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
174 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
175 
176 static cl::opt<bool> EnableEpilogueVectorization(
177     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
178     cl::desc("Enable vectorization of epilogue loops."));
179 
180 static cl::opt<unsigned> EpilogueVectorizationForceVF(
181     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
182     cl::desc("When epilogue vectorization is enabled, and a value greater than "
183              "1 is specified, forces the given VF for all applicable epilogue "
184              "loops."));
185 
186 static cl::opt<unsigned> EpilogueVectorizationMinVF(
187     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
188     cl::desc("Only loops with vectorization factor equal to or larger than "
189              "the specified value are considered for epilogue vectorization."));
190 
191 /// Loops with a known constant trip count below this number are vectorized only
192 /// if no scalar iteration overheads are incurred.
193 static cl::opt<unsigned> TinyTripCountVectorThreshold(
194     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
195     cl::desc("Loops with a constant trip count that is smaller than this "
196              "value are vectorized only if no scalar iteration overheads "
197              "are incurred."));
198 
199 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
200     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
201     cl::desc("The maximum allowed number of runtime memory checks with a "
202              "vectorize(enable) pragma."));
203 
204 // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
205 // and that predication is preferred; the enum below lists the possible settings.
206 // I.e., the vectorizer will try to fold the tail loop (epilogue) into the vector
207 // body and predicate the instructions accordingly. If tail-folding fails, there
208 // are different fallback strategies depending on these values:
209 namespace PreferPredicateTy {
210   enum Option {
211     ScalarEpilogue = 0,
212     PredicateElseScalarEpilogue,
213     PredicateOrDontVectorize
214   };
215 } // namespace PreferPredicateTy
216 
217 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
218     "prefer-predicate-over-epilogue",
219     cl::init(PreferPredicateTy::ScalarEpilogue),
220     cl::Hidden,
221     cl::desc("Tail-folding and predication preferences over creating a scalar "
222              "epilogue loop."),
223     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
224                          "scalar-epilogue",
225                          "Don't tail-predicate loops, create scalar epilogue"),
226               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
227                          "predicate-else-scalar-epilogue",
228                          "Prefer tail-folding, create scalar epilogue if "
229                          "tail-folding fails."),
230               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
231                          "predicate-dont-vectorize",
232                          "Prefer tail-folding, don't attempt vectorization if "
233                          "tail-folding fails.")));
234 
235 static cl::opt<bool> MaximizeBandwidth(
236     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
237     cl::desc("Maximize bandwidth when selecting vectorization factor which "
238              "will be determined by the smallest type in loop."));
239 
240 static cl::opt<bool> EnableInterleavedMemAccesses(
241     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
242     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
243 
244 /// An interleave-group may need masking if it resides in a block that needs
245 /// predication, or in order to mask away gaps.
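/// For example (illustrative), a group that stores to A[2*i] but not to
/// A[2*i+1] has a gap; the wide store covering both lanes must be masked so
/// the gap elements are not overwritten.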
246 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
247     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
248     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
249 
250 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
251     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
252     cl::desc("We don't interleave loops with an estimated constant trip count "
253              "below this number"));
254 
255 static cl::opt<unsigned> ForceTargetNumScalarRegs(
256     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
257     cl::desc("A flag that overrides the target's number of scalar registers."));
258 
259 static cl::opt<unsigned> ForceTargetNumVectorRegs(
260     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
261     cl::desc("A flag that overrides the target's number of vector registers."));
262 
263 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
264     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
265     cl::desc("A flag that overrides the target's max interleave factor for "
266              "scalar loops."));
267 
268 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
269     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
270     cl::desc("A flag that overrides the target's max interleave factor for "
271              "vectorized loops."));
272 
273 static cl::opt<unsigned> ForceTargetInstructionCost(
274     "force-target-instruction-cost", cl::init(0), cl::Hidden,
275     cl::desc("A flag that overrides the target's expected cost for "
276              "an instruction to a single constant value. Mostly "
277              "useful for getting consistent testing."));
278 
279 static cl::opt<bool> ForceTargetSupportsScalableVectors(
280     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
281     cl::desc(
282         "Pretend that scalable vectors are supported, even if the target does "
283         "not support them. This flag should only be used for testing."));
284 
285 static cl::opt<unsigned> SmallLoopCost(
286     "small-loop-cost", cl::init(20), cl::Hidden,
287     cl::desc(
288         "The cost of a loop that is considered 'small' by the interleaver."));
289 
290 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
291     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
292     cl::desc("Enable the use of the block frequency analysis to access PGO "
293              "heuristics minimizing code growth in cold regions and being more "
294              "aggressive in hot regions."));
295 
296 // Runtime interleave loops for load/store throughput.
297 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
298     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
299     cl::desc(
300         "Enable runtime interleaving until load/store ports are saturated"));
301 
302 /// Interleave small loops with scalar reductions.
303 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
304     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
305     cl::desc("Enable interleaving for loops with small iteration counts that "
306              "contain scalar reductions to expose ILP."));
307 
308 /// The number of stores in a loop that are allowed to need predication.
309 static cl::opt<unsigned> NumberOfStoresToPredicate(
310     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
311     cl::desc("Max number of stores to be predicated behind an if."));
312 
313 static cl::opt<bool> EnableIndVarRegisterHeur(
314     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
315     cl::desc("Count the induction variable only once when interleaving"));
316 
317 static cl::opt<bool> EnableCondStoresVectorization(
318     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
319     cl::desc("Enable if predication of stores during vectorization."));
320 
321 static cl::opt<unsigned> MaxNestedScalarReductionIC(
322     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
323     cl::desc("The maximum interleave count to use when interleaving a scalar "
324              "reduction in a nested loop."));
325 
326 static cl::opt<bool>
327     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
328                            cl::Hidden,
329                            cl::desc("Prefer in-loop vector reductions, "
330                                     "overriding the target's preference."));
331 
332 static cl::opt<bool> ForceOrderedReductions(
333     "force-ordered-reductions", cl::init(false), cl::Hidden,
334     cl::desc("Enable the vectorization of loops with in-order (strict) "
335              "FP reductions"));
336 
337 static cl::opt<bool> PreferPredicatedReductionSelect(
338     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
339     cl::desc(
340         "Prefer predicating a reduction operation over an after loop select."));
341 
342 cl::opt<bool> EnableVPlanNativePath(
343     "enable-vplan-native-path", cl::init(false), cl::Hidden,
344     cl::desc("Enable VPlan-native vectorization path with "
345              "support for outer loop vectorization."));
346 
347 // FIXME: Remove this switch once we have divergence analysis. Currently we
348 // assume divergent non-backedge branches when this switch is true.
349 cl::opt<bool> EnableVPlanPredication(
350     "enable-vplan-predication", cl::init(false), cl::Hidden,
351     cl::desc("Enable VPlan-native vectorization path predicator with "
352              "support for outer loop vectorization."));
353 
354 // This flag enables the stress testing of the VPlan H-CFG construction in the
355 // VPlan-native vectorization path. It must be used in conjunction with
356 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
357 // verification of the H-CFGs built.
358 static cl::opt<bool> VPlanBuildStressTest(
359     "vplan-build-stress-test", cl::init(false), cl::Hidden,
360     cl::desc(
361         "Build VPlan for every supported loop nest in the function and bail "
362         "out right after the build (stress test the VPlan H-CFG construction "
363         "in the VPlan-native vectorization path)."));
364 
365 cl::opt<bool> llvm::EnableLoopInterleaving(
366     "interleave-loops", cl::init(true), cl::Hidden,
367     cl::desc("Enable loop interleaving in Loop vectorization passes"));
368 cl::opt<bool> llvm::EnableLoopVectorization(
369     "vectorize-loops", cl::init(true), cl::Hidden,
370     cl::desc("Run the Loop vectorization passes"));
371 
372 cl::opt<bool> PrintVPlansInDotFormat(
373     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
374     cl::desc("Use dot format instead of plain text when dumping VPlans"));
375 
376 /// A helper function that returns true if the given type is irregular. The
377 /// type is irregular if its allocated size doesn't equal the store size of an
378 /// element of the corresponding vector type.
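/// For example (assuming a typical data layout), an i19 has a type size of 19
/// bits but an alloc size of 32 bits, so an array of i19 is not
/// layout-compatible with <N x i19> and the type is considered irregular.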
379 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
380   // Determine if an array of N elements of type Ty is "bitcast compatible"
381   // with a <N x Ty> vector.
382   // This is only true if there is no padding between the array elements.
383   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
384 }
385 
386 /// A helper function that returns the reciprocal of the block probability of
387 /// predicated blocks. If we return X, we are assuming the predicated block
388 /// will execute once for every X iterations of the loop header.
389 ///
390 /// TODO: We should use actual block probability here, if available. Currently,
391 ///       we always assume predicated blocks have a 50% chance of executing.
392 static unsigned getReciprocalPredBlockProb() { return 2; }
393 
394 /// A helper function that returns an integer or floating-point constant with
395 /// value C.
396 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
397   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
398                            : ConstantFP::get(Ty, C);
399 }
400 
401 /// Returns "best known" trip count for the specified loop \p L as defined by
402 /// the following procedure:
403 ///   1) Returns exact trip count if it is known.
404 ///   2) Returns expected trip count according to profile data if any.
405 ///   3) Returns upper bound estimate if it is known.
406 ///   4) Returns None if all of the above failed.
407 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
408   // Check if exact trip count is known.
409   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
410     return ExpectedTC;
411 
412   // Check if there is an expected trip count available from profile data.
413   if (LoopVectorizeWithBlockFrequency)
414     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
415       return EstimatedTC;
416 
417   // Check if upper bound estimate is known.
418   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
419     return ExpectedTC;
420 
421   return None;
422 }
423 
424 // Forward declare GeneratedRTChecks.
425 class GeneratedRTChecks;
426 
427 namespace llvm {
428 
429 AnalysisKey ShouldRunExtraVectorPasses::Key;
430 
431 /// InnerLoopVectorizer vectorizes loops which contain only one basic
432 /// block to a specified vectorization factor (VF).
433 /// This class performs the widening of scalars into vectors, or multiple
434 /// scalars. This class also implements the following features:
435 /// * It inserts an epilogue loop for handling loops that don't have iteration
436 ///   counts that are known to be a multiple of the vectorization factor.
437 /// * It handles the code generation for reduction variables.
438 /// * Scalarization (implementation using scalars) of un-vectorizable
439 ///   instructions.
440 /// InnerLoopVectorizer does not perform any vectorization-legality
441 /// checks, and relies on the caller to check for the different legality
442 /// aspects. The InnerLoopVectorizer relies on the
443 /// LoopVectorizationLegality class to provide information about the induction
444 /// and reduction variables that were found in the loop.
445 class InnerLoopVectorizer {
446 public:
447   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
448                       LoopInfo *LI, DominatorTree *DT,
449                       const TargetLibraryInfo *TLI,
450                       const TargetTransformInfo *TTI, AssumptionCache *AC,
451                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
452                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
453                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
454                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
455       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
456         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
457         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
458         PSI(PSI), RTChecks(RTChecks) {
459     // Query this against the original loop and save it here because the profile
460     // of the original loop header may change as the transformation happens.
461     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
462         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
463   }
464 
465   virtual ~InnerLoopVectorizer() = default;
466 
467   /// Create a new empty loop that will contain vectorized instructions later
468   /// on, while the old loop will be used as the scalar remainder. Control flow
469   /// is generated around the vectorized (and scalar epilogue) loops consisting
470   /// of various checks and bypasses. Return the pre-header block of the new
471   /// loop and the start value for the canonical induction, if it is != 0. The
472   /// latter is the case when vectorizing the epilogue loop. In the case of
473 /// epilogue vectorization, this function is overridden to handle the more
474   /// complex control flow around the loops.
475   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
476 
477   /// Widen a single call instruction within the innermost loop.
478   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
479                             VPTransformState &State);
480 
481   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
482   void fixVectorizedLoop(VPTransformState &State);
483 
484   // Return true if any runtime check is added.
485   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
486 
487   /// A type for vectorized values in the new loop. Each value from the
488   /// original loop, when vectorized, is represented by UF vector values in the
489   /// new unrolled loop, where UF is the unroll factor.
490   using VectorParts = SmallVector<Value *, 2>;
491 
492   /// Vectorize a single first-order recurrence or pointer induction PHINode in
493   /// a block. This method handles the induction variable canonicalization. It
494   /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
495   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
496                            VPTransformState &State);
497 
498   /// A helper function to scalarize a single Instruction in the innermost loop.
499   /// Generates a sequence of scalar instances for each lane between \p MinLane
500   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
501   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
502   /// Instr's operands.
503   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
504                             const VPIteration &Instance, bool IfPredicateInstr,
505                             VPTransformState &State);
506 
507   /// Construct the vector value of a scalarized value \p V one lane at a time.
508   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
509                                  VPTransformState &State);
510 
511   /// Try to vectorize interleaved access group \p Group with the base address
512   /// given in \p Addr, optionally masking the vector operations if \p
513   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
514   /// values in the vectorized loop.
515   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
516                                 ArrayRef<VPValue *> VPDefs,
517                                 VPTransformState &State, VPValue *Addr,
518                                 ArrayRef<VPValue *> StoredValues,
519                                 VPValue *BlockInMask = nullptr);
520 
521   /// Set the debug location in \p CustomBuilder using the debug location in
522   /// \p V. If \p CustomBuilder is None, the class member's Builder is used.
523   void setDebugLocFromInst(const Value *V,
524                            Optional<IRBuilderBase *> CustomBuilder = None);
525 
526   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
527   void fixNonInductionPHIs(VPTransformState &State);
528 
529   /// Returns true if the reordering of FP operations is not allowed, but we are
530   /// able to vectorize with strict in-order reductions for the given RdxDesc.
531   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
532 
533   /// Create a broadcast instruction. This method generates a broadcast
534   /// instruction (shuffle) for loop invariant values and for the induction
535   /// value. If this is the induction variable then we extend it to N, N+1, ...
536   /// this is needed because each iteration in the loop corresponds to a SIMD
537   /// element.
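  /// For example (illustrative), with VF = 4 a loop-invariant value %x becomes
  /// <%x, %x, %x, %x>, while an induction value N becomes <N, N+1, N+2, N+3>.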
538   virtual Value *getBroadcastInstrs(Value *V);
539 
540   /// Add metadata from one instruction to another.
541   ///
542   /// This includes both the original MDs from \p From and additional ones (\see
543   /// addNewMetadata).  Use this for *newly created* instructions in the vector
544   /// loop.
545   void addMetadata(Instruction *To, Instruction *From);
546 
547   /// Similar to the previous function but it adds the metadata to a
548   /// vector of instructions.
549   void addMetadata(ArrayRef<Value *> To, Instruction *From);
550 
551   // Returns the resume value (bc.merge.rdx) for a reduction as
552   // generated by fixReduction.
553   PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
554 
555 protected:
556   friend class LoopVectorizationPlanner;
557 
558   /// A small list of PHINodes.
559   using PhiVector = SmallVector<PHINode *, 4>;
560 
561   /// A type for scalarized values in the new loop. Each value from the
562   /// original loop, when scalarized, is represented by UF x VF scalar values
563   /// in the new unrolled loop, where UF is the unroll factor and VF is the
564   /// vectorization factor.
565   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
566 
567   /// Set up the values of the IVs correctly when exiting the vector loop.
568   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
569                     Value *CountRoundDown, Value *EndValue,
570                     BasicBlock *MiddleBlock, BasicBlock *VectorHeader);
571 
572   /// Introduce a conditional branch (on true, condition to be set later) at the
573   /// end of the header=latch connecting it to itself (across the backedge) and
574   /// to the exit block of \p L.
575   void createHeaderBranch(Loop *L);
576 
577   /// Handle all cross-iteration phis in the header.
578   void fixCrossIterationPHIs(VPTransformState &State);
579 
580   /// Create the exit value of first order recurrences in the middle block and
581   /// update their users.
582   void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
583                                VPTransformState &State);
584 
585   /// Create code for the loop exit value of the reduction.
586   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
587 
588   /// Clear NSW/NUW flags from reduction instructions if necessary.
589   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
590                                VPTransformState &State);
591 
592   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
593   /// means we need to add the appropriate incoming value from the middle
594   /// block as exiting edges from the scalar epilogue loop (if present) are
595   /// already in place, and we exit the vector loop exclusively to the middle
596   /// block.
597   void fixLCSSAPHIs(VPTransformState &State);
598 
599   /// Iteratively sink the scalarized operands of a predicated instruction into
600   /// the block that was created for it.
601   void sinkScalarOperands(Instruction *PredInst);
602 
603   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
604   /// represented as.
605   void truncateToMinimalBitwidths(VPTransformState &State);
606 
607   /// Returns (and creates if needed) the original loop trip count.
608   Value *getOrCreateTripCount(Loop *NewLoop);
609 
610   /// Returns (and creates if needed) the trip count of the widened loop.
611   Value *getOrCreateVectorTripCount(Loop *NewLoop);
612 
613   /// Returns a bitcasted value to the requested vector type.
614   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
615   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
616                                 const DataLayout &DL);
617 
618   /// Emit a bypass check to see if the vector trip count is zero, including if
619   /// it overflows.
620   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
621 
622   /// Emit a bypass check to see if all of the SCEV assumptions we've
623   /// had to make are correct. Returns the block containing the checks or
624   /// nullptr if no checks have been added.
625   BasicBlock *emitSCEVChecks(BasicBlock *Bypass);
626 
627   /// Emit bypass checks to check any memory assumptions we may have made.
628   /// Returns the block containing the checks or nullptr if no checks have been
629   /// added.
630   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
631 
632   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
633   /// vector loop preheader, middle block and scalar preheader. Also
634   /// allocate a loop object for the new vector loop and return it.
635   Loop *createVectorLoopSkeleton(StringRef Prefix);
636 
637   /// Create new phi nodes for the induction variables to resume iteration count
638   /// in the scalar epilogue, from where the vectorized loop left off.
639   /// In cases where the loop skeleton is more complicated (e.g. epilogue
640   /// vectorization) and the resume values can come from an additional bypass
641   /// block, the \p AdditionalBypass pair provides information about the bypass
642   /// block and the end value on the edge from bypass to this loop.
643   void createInductionResumeValues(
644       Loop *L,
645       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
646 
647   /// Complete the loop skeleton by adding debug MDs, creating appropriate
648   /// conditional branches in the middle block, preparing the builder and
649   /// running the verifier. Take in the vector loop \p L as argument, and return
650   /// the preheader of the completed vector loop.
651   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
652 
653   /// Add additional metadata to \p To that was not present on \p Orig.
654   ///
655   /// Currently this is used to add the noalias annotations based on the
656   /// inserted memchecks.  Use this for instructions that are *cloned* into the
657   /// vector loop.
658   void addNewMetadata(Instruction *To, const Instruction *Orig);
659 
660   /// Collect poison-generating recipes that may generate a poison value that is
661   /// used after vectorization, even when their operands are not poison. Those
662   /// recipes meet the following conditions:
663   ///  * Contribute to the address computation of a recipe generating a widen
664   ///    memory load/store (VPWidenMemoryInstructionRecipe or
665   ///    VPInterleaveRecipe).
666   ///  * Such a widen memory load/store has at least one underlying Instruction
667   ///    that is in a basic block that needs predication and after vectorization
668   ///    the generated instruction won't be predicated.
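  /// For example (illustrative): an "inbounds" GEP feeding the address of a
  /// load in a conditionally executed block may produce poison for lanes whose
  /// condition is false; if the widened load ends up unpredicated, such a GEP
  /// is collected here so its poison-generating flags can later be dropped.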
669   void collectPoisonGeneratingRecipes(VPTransformState &State);
670 
671   /// Allow subclasses to override and print debug traces before/after vplan
672   /// execution, when trace information is requested.
673   virtual void printDebugTracesAtStart() {}
674   virtual void printDebugTracesAtEnd() {}
675 
676   /// The original loop.
677   Loop *OrigLoop;
678 
679   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
680   /// dynamic knowledge to simplify SCEV expressions and converts them to a
681   /// more usable form.
682   PredicatedScalarEvolution &PSE;
683 
684   /// Loop Info.
685   LoopInfo *LI;
686 
687   /// Dominator Tree.
688   DominatorTree *DT;
689 
690   /// Alias Analysis.
691   AAResults *AA;
692 
693   /// Target Library Info.
694   const TargetLibraryInfo *TLI;
695 
696   /// Target Transform Info.
697   const TargetTransformInfo *TTI;
698 
699   /// Assumption Cache.
700   AssumptionCache *AC;
701 
702   /// Interface to emit optimization remarks.
703   OptimizationRemarkEmitter *ORE;
704 
705   /// LoopVersioning.  It's only set up (non-null) if memchecks were
706   /// used.
707   ///
708   /// This is currently only used to add no-alias metadata based on the
709   /// memchecks.  The actual versioning is performed manually.
710   std::unique_ptr<LoopVersioning> LVer;
711 
712   /// The vectorization SIMD factor to use. Each vector will have this many
713   /// vector elements.
714   ElementCount VF;
715 
716   /// The vectorization unroll factor to use. Each scalar is vectorized to this
717   /// many different vector instructions.
718   unsigned UF;
719 
720   /// The builder that we use.
721   IRBuilder<> Builder;
722 
723   // --- Vectorization state ---
724 
725   /// The vector-loop preheader.
726   BasicBlock *LoopVectorPreHeader;
727 
728   /// The scalar-loop preheader.
729   BasicBlock *LoopScalarPreHeader;
730 
731   /// Middle Block between the vector and the scalar.
732   BasicBlock *LoopMiddleBlock;
733 
734   /// The unique ExitBlock of the scalar loop if one exists.  Note that
735   /// there can be multiple exiting edges reaching this block.
736   BasicBlock *LoopExitBlock;
737 
738   /// The scalar loop body.
739   BasicBlock *LoopScalarBody;
740 
741   /// A list of all bypass blocks. The first block is the entry of the loop.
742   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
743 
744   /// Store instructions that were predicated.
745   SmallVector<Instruction *, 4> PredicatedInstructions;
746 
747   /// Trip count of the original loop.
748   Value *TripCount = nullptr;
749 
750   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
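  /// (e.g. for TripCount = 17 and VF * UF = 8 this is 16).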
751   Value *VectorTripCount = nullptr;
752 
753   /// The legality analysis.
754   LoopVectorizationLegality *Legal;
755 
756   /// The profitability analysis.
757   LoopVectorizationCostModel *Cost;
758 
759   // Record whether runtime checks are added.
760   bool AddedSafetyChecks = false;
761 
762   // Holds the end values for each induction variable. We save the end values
763   // so we can later fix-up the external users of the induction variables.
764   DenseMap<PHINode *, Value *> IVEndValues;
765 
766   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
767   // fixed up at the end of vector code generation.
768   SmallVector<PHINode *, 8> OrigPHIsToFix;
769 
770   /// BFI and PSI are used to check for profile guided size optimizations.
771   BlockFrequencyInfo *BFI;
772   ProfileSummaryInfo *PSI;
773 
774   // Whether this loop should be optimized for size based on profile guided size
775   // optimizations.
776   bool OptForSizeBasedOnProfile;
777 
778   /// Structure to hold information about generated runtime checks, responsible
779   /// for cleaning up the checks if vectorization turns out to be unprofitable.
780   GeneratedRTChecks &RTChecks;
781 
782   // Holds the resume values for reductions in the loops, used to set the
783   // correct start value of reduction PHIs when vectorizing the epilogue.
784   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
785       ReductionResumeValues;
786 };
787 
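/// A degenerate InnerLoopVectorizer with a fixed vectorization factor of one.
/// It is used to interleave (unroll) the scalar loop by UnrollFactor without
/// widening any instructions.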
788 class InnerLoopUnroller : public InnerLoopVectorizer {
789 public:
790   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
791                     LoopInfo *LI, DominatorTree *DT,
792                     const TargetLibraryInfo *TLI,
793                     const TargetTransformInfo *TTI, AssumptionCache *AC,
794                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
795                     LoopVectorizationLegality *LVL,
796                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
797                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
798       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
799                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
800                             BFI, PSI, Check) {}
801 
802 private:
803   Value *getBroadcastInstrs(Value *V) override;
804 };
805 
806 /// Encapsulate information regarding vectorization of a loop and its epilogue.
807 /// This information is meant to be updated and used across two stages of
808 /// epilogue vectorization.
809 struct EpilogueLoopVectorizationInfo {
810   ElementCount MainLoopVF = ElementCount::getFixed(0);
811   unsigned MainLoopUF = 0;
812   ElementCount EpilogueVF = ElementCount::getFixed(0);
813   unsigned EpilogueUF = 0;
814   BasicBlock *MainLoopIterationCountCheck = nullptr;
815   BasicBlock *EpilogueIterationCountCheck = nullptr;
816   BasicBlock *SCEVSafetyCheck = nullptr;
817   BasicBlock *MemSafetyCheck = nullptr;
818   Value *TripCount = nullptr;
819   Value *VectorTripCount = nullptr;
820 
821   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
822                                 ElementCount EVF, unsigned EUF)
823       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
824     assert(EUF == 1 &&
825            "A high UF for the epilogue loop is likely not beneficial.");
826   }
827 };
828 
829 /// An extension of the inner loop vectorizer that creates a skeleton for a
830 /// vectorized loop that has its epilogue (residual) also vectorized.
831 /// The idea is to run the vplan on a given loop twice, first to set up the
832 /// skeleton and vectorize the main loop, and second to complete the skeleton
833 /// from the first step and vectorize the epilogue.  This is achieved by
834 /// deriving two concrete strategy classes from this base class and invoking
835 /// them in succession from the loop vectorizer planner.
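/// Illustratively, the resulting control flow is roughly: iteration-count and
/// runtime checks -> main vector loop -> epilogue iteration-count check ->
/// epilogue vector loop -> scalar remainder loop.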
836 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
837 public:
838   InnerLoopAndEpilogueVectorizer(
839       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
840       DominatorTree *DT, const TargetLibraryInfo *TLI,
841       const TargetTransformInfo *TTI, AssumptionCache *AC,
842       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
843       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
844       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
845       GeneratedRTChecks &Checks)
846       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
847                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
848                             Checks),
849         EPI(EPI) {}
850 
851   // Override this function to handle the more complex control flow around the
852   // three loops.
853   std::pair<BasicBlock *, Value *>
854   createVectorizedLoopSkeleton() final override {
855     return createEpilogueVectorizedLoopSkeleton();
856   }
857 
858   /// The interface for creating a vectorized skeleton using one of two
859   /// different strategies, each corresponding to one execution of the vplan
860   /// as described above.
861   virtual std::pair<BasicBlock *, Value *>
862   createEpilogueVectorizedLoopSkeleton() = 0;
863 
864   /// Holds and updates state information required to vectorize the main loop
865   /// and its epilogue in two separate passes. This setup helps us avoid
866   /// regenerating and recomputing runtime safety checks. It also helps us to
867   /// shorten the iteration-count-check path length for the cases where the
868   /// iteration count of the loop is so small that the main vector loop is
869   /// completely skipped.
870   EpilogueLoopVectorizationInfo &EPI;
871 };
872 
873 /// A specialized derived class of inner loop vectorizer that performs
874 /// vectorization of *main* loops in the process of vectorizing loops and their
875 /// epilogues.
876 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
877 public:
878   EpilogueVectorizerMainLoop(
879       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
880       DominatorTree *DT, const TargetLibraryInfo *TLI,
881       const TargetTransformInfo *TTI, AssumptionCache *AC,
882       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
883       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
884       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
885       GeneratedRTChecks &Check)
886       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
887                                        EPI, LVL, CM, BFI, PSI, Check) {}
888   /// Implements the interface for creating a vectorized skeleton using the
889   /// *main loop* strategy (i.e. the first pass of vplan execution).
890   std::pair<BasicBlock *, Value *>
891   createEpilogueVectorizedLoopSkeleton() final override;
892 
893 protected:
894   /// Emits an iteration count bypass check once for the main loop (when \p
895   /// ForEpilogue is false) and once for the epilogue loop (when \p
896   /// ForEpilogue is true).
897   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
898                                              bool ForEpilogue);
899   void printDebugTracesAtStart() override;
900   void printDebugTracesAtEnd() override;
901 };
902 
903 /// A specialized derived class of inner loop vectorizer that performs
904 /// vectorization of *epilogue* loops in the process of vectorizing loops and
905 /// their epilogues.
906 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
907 public:
908   EpilogueVectorizerEpilogueLoop(
909       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
910       DominatorTree *DT, const TargetLibraryInfo *TLI,
911       const TargetTransformInfo *TTI, AssumptionCache *AC,
912       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
913       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
914       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
915       GeneratedRTChecks &Checks)
916       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
917                                        EPI, LVL, CM, BFI, PSI, Checks) {}
918   /// Implements the interface for creating a vectorized skeleton using the
919   /// *epilogue loop* strategy (i.e. the second pass of vplan execution).
920   std::pair<BasicBlock *, Value *>
921   createEpilogueVectorizedLoopSkeleton() final override;
922 
923 protected:
924   /// Emits an iteration count bypass check after the main vector loop has
925   /// finished to see if there are any iterations left to execute by either
926   /// the vector epilogue or the scalar epilogue.
927   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
928                                                       BasicBlock *Insert);
930   void printDebugTracesAtStart() override;
931   void printDebugTracesAtEnd() override;
932 };
933 } // end namespace llvm
934 
935 /// Look for a meaningful debug location on the instruction or its
936 /// operands.
937 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
938   if (!I)
939     return I;
940 
941   DebugLoc Empty;
942   if (I->getDebugLoc() != Empty)
943     return I;
944 
945   for (Use &Op : I->operands()) {
946     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
947       if (OpInst->getDebugLoc() != Empty)
948         return OpInst;
949   }
950 
951   return I;
952 }
953 
954 void InnerLoopVectorizer::setDebugLocFromInst(
955     const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
956   IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
957   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
958     const DILocation *DIL = Inst->getDebugLoc();
959 
960     // When an FSDiscriminator is enabled, we don't need to add the multiply
961     // factors to the discriminators.
962     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
963         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
964       // FIXME: For scalable vectors, assume vscale=1.
965       auto NewDIL =
966           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
967       if (NewDIL)
968         B->SetCurrentDebugLocation(NewDIL.getValue());
969       else
970         LLVM_DEBUG(dbgs()
971                    << "Failed to create new discriminator: "
972                    << DIL->getFilename() << " Line: " << DIL->getLine());
973     } else
974       B->SetCurrentDebugLocation(DIL);
975   } else
976     B->SetCurrentDebugLocation(DebugLoc());
977 }
978 
979 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
980 /// is passed, the message relates to that particular instruction.
981 #ifndef NDEBUG
982 static void debugVectorizationMessage(const StringRef Prefix,
983                                       const StringRef DebugMsg,
984                                       Instruction *I) {
985   dbgs() << "LV: " << Prefix << DebugMsg;
986   if (I != nullptr)
987     dbgs() << " " << *I;
988   else
989     dbgs() << '.';
990   dbgs() << '\n';
991 }
992 #endif
993 
994 /// Create an analysis remark that explains why vectorization failed
995 ///
996 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
997 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
998 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
999 /// the location of the remark.  \return the remark object that can be
1000 /// streamed to.
1001 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1002     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1003   Value *CodeRegion = TheLoop->getHeader();
1004   DebugLoc DL = TheLoop->getStartLoc();
1005 
1006   if (I) {
1007     CodeRegion = I->getParent();
1008     // If there is no debug location attached to the instruction, fall back to
1009     // using the loop's.
1010     if (I->getDebugLoc())
1011       DL = I->getDebugLoc();
1012   }
1013 
1014   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1015 }
1016 
1017 namespace llvm {
1018 
1019 /// Return a value for Step multiplied by VF.
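/// For example (illustrative), for VF = <vscale x 4> and Step = 2 this emits a
/// runtime value equal to vscale * 8, while for a fixed VF = 4 it simply
/// returns the constant 8.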
1020 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
1021                        int64_t Step) {
1022   assert(Ty->isIntegerTy() && "Expected an integer step");
1023   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
1024   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1025 }
1026 
1027 /// Return the runtime value for VF.
1028 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
1029   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1030   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1031 }
1032 
1033 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
1034                                   ElementCount VF) {
1035   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1036   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1037   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1038   return B.CreateUIToFP(RuntimeVF, FTy);
1039 }
1040 
1041 void reportVectorizationFailure(const StringRef DebugMsg,
1042                                 const StringRef OREMsg, const StringRef ORETag,
1043                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1044                                 Instruction *I) {
1045   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1046   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1047   ORE->emit(
1048       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1049       << "loop not vectorized: " << OREMsg);
1050 }
1051 
1052 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1053                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1054                              Instruction *I) {
1055   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1056   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1057   ORE->emit(
1058       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1059       << Msg);
1060 }
1061 
1062 } // end namespace llvm
1063 
1064 #ifndef NDEBUG
1065 /// \return string containing a file name and a line # for the given loop.
1066 static std::string getDebugLocString(const Loop *L) {
1067   std::string Result;
1068   if (L) {
1069     raw_string_ostream OS(Result);
1070     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1071       LoopDbgLoc.print(OS);
1072     else
1073       // Just print the module name.
1074       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1075     OS.flush();
1076   }
1077   return Result;
1078 }
1079 #endif
1080 
1081 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1082                                          const Instruction *Orig) {
1083   // If the loop was versioned with memchecks, add the corresponding no-alias
1084   // metadata.
1085   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1086     LVer->annotateInstWithNoAlias(To, Orig);
1087 }
1088 
1089 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1090     VPTransformState &State) {
1091 
1092   // Collect recipes in the backward slice of `Root` that may generate a poison
1093   // value that is used after vectorization.
1094   SmallPtrSet<VPRecipeBase *, 16> Visited;
1095   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1096     SmallVector<VPRecipeBase *, 16> Worklist;
1097     Worklist.push_back(Root);
1098 
1099     // Traverse the backward slice of Root through its use-def chain.
1100     while (!Worklist.empty()) {
1101       VPRecipeBase *CurRec = Worklist.back();
1102       Worklist.pop_back();
1103 
1104       if (!Visited.insert(CurRec).second)
1105         continue;
1106 
1107       // Prune search if we find another recipe generating a widen memory
1108       // instruction. Widen memory instructions involved in address computation
1109       // will lead to gather/scatter instructions, which don't need to be
1110       // handled.
1111       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1112           isa<VPInterleaveRecipe>(CurRec) ||
1113           isa<VPScalarIVStepsRecipe>(CurRec) ||
1114           isa<VPCanonicalIVPHIRecipe>(CurRec))
1115         continue;
1116 
1117       // This recipe contributes to the address computation of a widen
1118       // load/store. Collect recipe if its underlying instruction has
1119       // poison-generating flags.
1120       Instruction *Instr = CurRec->getUnderlyingInstr();
1121       if (Instr && Instr->hasPoisonGeneratingFlags())
1122         State.MayGeneratePoisonRecipes.insert(CurRec);
1123 
1124       // Add new definitions to the worklist.
1125       for (VPValue *operand : CurRec->operands())
1126         if (VPDef *OpDef = operand->getDef())
1127           Worklist.push_back(cast<VPRecipeBase>(OpDef));
1128     }
1129   });
1130 
1131   // Traverse all the recipes in the VPlan and collect the poison-generating
1132   // recipes in the backward slice starting at the address of a
1133   // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1134   auto Iter = depth_first(
1135       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1136   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1137     for (VPRecipeBase &Recipe : *VPBB) {
1138       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1139         Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
1140         VPDef *AddrDef = WidenRec->getAddr()->getDef();
1141         if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
1142             Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
1143           collectPoisonGeneratingInstrsInBackwardSlice(
1144               cast<VPRecipeBase>(AddrDef));
1145       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1146         VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1147         if (AddrDef) {
1148           // Check if any member of the interleave group needs predication.
1149           const InterleaveGroup<Instruction> *InterGroup =
1150               InterleaveRec->getInterleaveGroup();
1151           bool NeedPredication = false;
1152           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1153                I < NumMembers; ++I) {
1154             Instruction *Member = InterGroup->getMember(I);
1155             if (Member)
1156               NeedPredication |=
1157                   Legal->blockNeedsPredication(Member->getParent());
1158           }
1159 
1160           if (NeedPredication)
1161             collectPoisonGeneratingInstrsInBackwardSlice(
1162                 cast<VPRecipeBase>(AddrDef));
1163         }
1164       }
1165     }
1166   }
1167 }
1168 
1169 void InnerLoopVectorizer::addMetadata(Instruction *To,
1170                                       Instruction *From) {
1171   propagateMetadata(To, From);
1172   addNewMetadata(To, From);
1173 }
1174 
1175 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1176                                       Instruction *From) {
1177   for (Value *V : To) {
1178     if (Instruction *I = dyn_cast<Instruction>(V))
1179       addMetadata(I, From);
1180   }
1181 }
1182 
1183 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1184     const RecurrenceDescriptor &RdxDesc) {
1185   auto It = ReductionResumeValues.find(&RdxDesc);
1186   assert(It != ReductionResumeValues.end() &&
1187          "Expected to find a resume value for the reduction.");
1188   return It->second;
1189 }
1190 
1191 namespace llvm {
1192 
1193 // Hints to the loop vectorization cost model about how the scalar epilogue
1194 // loop should be lowered.
1195 enum ScalarEpilogueLowering {
1196 
1197   // The default: allowing scalar epilogues.
1198   CM_ScalarEpilogueAllowed,
1199 
1200   // Vectorization with OptForSize: don't allow epilogues.
1201   CM_ScalarEpilogueNotAllowedOptSize,
1202 
1203   // A special case of vectorization with OptForSize: loops with a very small
1204   // trip count are considered for vectorization under OptForSize, thereby
1205   // making sure the cost of their loop body is dominant, free of runtime
1206   // guards and scalar iteration overheads.
1207   CM_ScalarEpilogueNotAllowedLowTripLoop,
1208 
1209   // Loop hint predicate indicating an epilogue is undesired.
1210   CM_ScalarEpilogueNotNeededUsePredicate,
1211 
  // Directive indicating we must either tail fold or not vectorize.
1213   CM_ScalarEpilogueNotAllowedUsePredicate
1214 };
1215 
1216 /// ElementCountComparator creates a total ordering for ElementCount
1217 /// for the purposes of using it in a set structure.
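/// For example, all fixed-width element counts (ordered by their known minimum
/// value) compare less than all scalable element counts.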
1218 struct ElementCountComparator {
1219   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1220     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1221            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1222   }
1223 };
1224 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1225 
1226 /// LoopVectorizationCostModel - estimates the expected speedups due to
1227 /// vectorization.
1228 /// In many cases vectorization is not profitable. This can happen because of
1229 /// a number of reasons. In this class we mainly attempt to predict the
1230 /// expected speedup/slowdowns due to the supported instruction set. We use the
1231 /// TargetTransformInfo to query the different backends for the cost of
1232 /// different operations.
1233 class LoopVectorizationCostModel {
1234 public:
1235   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1236                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1237                              LoopVectorizationLegality *Legal,
1238                              const TargetTransformInfo &TTI,
1239                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1240                              AssumptionCache *AC,
1241                              OptimizationRemarkEmitter *ORE, const Function *F,
1242                              const LoopVectorizeHints *Hints,
1243                              InterleavedAccessInfo &IAI)
1244       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1245         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1246         Hints(Hints), InterleaveInfo(IAI) {}
1247 
1248   /// \return An upper bound for the vectorization factors (both fixed and
1249   /// scalable). If the factors are 0, vectorization and interleaving should be
1250   /// avoided up front.
1251   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1252 
1253   /// \return True if runtime checks are required for vectorization, and false
1254   /// otherwise.
1255   bool runtimeChecksRequired();
1256 
1257   /// \return The most profitable vectorization factor and the cost of that VF.
1258   /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1259   /// then this vectorization factor will be selected if vectorization is
1260   /// possible.
1261   VectorizationFactor
1262   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1263 
1264   VectorizationFactor
1265   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1266                                     const LoopVectorizationPlanner &LVP);
1267 
1268   /// Setup cost-based decisions for user vectorization factor.
1269   /// \return true if the UserVF is a feasible VF to be chosen.
1270   bool selectUserVectorizationFactor(ElementCount UserVF) {
1271     collectUniformsAndScalars(UserVF);
1272     collectInstsToScalarize(UserVF);
1273     return expectedCost(UserVF).first.isValid();
1274   }
1275 
1276   /// \return The size (in bits) of the smallest and widest types in the code
1277   /// that needs to be vectorized. We ignore values that remain scalar such as
1278   /// 64 bit loop indices.
1279   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1280 
1281   /// \return The desired interleave count.
1282   /// If interleave count has been specified by metadata it will be returned.
1283   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1284   /// are the selected vectorization factor and the cost of the selected VF.
1285   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1286 
  /// A memory access instruction may be vectorized in more than one way.
  /// The form of the instruction after vectorization depends on its cost.
  /// This function takes cost-based decisions for Load/Store instructions
  /// and collects them in a map. This decision map is used for building
  /// the lists of loop-uniform and loop-scalar instructions.
  /// The calculated cost is saved with the widening decision in order to
  /// avoid redundant calculations.
1294   void setCostBasedWideningDecision(ElementCount VF);
1295 
1296   /// A struct that represents some properties of the register usage
1297   /// of a loop.
1298   struct RegisterUsage {
1299     /// Holds the number of loop invariant values that are used in the loop.
1300     /// The key is ClassID of target-provided register class.
1301     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1302     /// Holds the maximum number of concurrent live intervals in the loop.
1303     /// The key is ClassID of target-provided register class.
1304     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1305   };
1306 
  /// \return Returns information about the register usage of the loop for the
  /// given vectorization factors.
1309   SmallVector<RegisterUsage, 8>
1310   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1311 
1312   /// Collect values we want to ignore in the cost model.
1313   void collectValuesToIgnore();
1314 
1315   /// Collect all element types in the loop for which widening is needed.
1316   void collectElementTypesForWidening();
1317 
  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1320   void collectInLoopReductions();
1321 
1322   /// Returns true if we should use strict in-order reductions for the given
1323   /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
1324   /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
1325   /// of FP operations.
1326   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1327     return !Hints->allowReordering() && RdxDesc.isOrdered();
1328   }
1329 
1330   /// \returns The smallest bitwidth each instruction can be represented with.
1331   /// The vector equivalents of these instructions should be truncated to this
1332   /// type.
1333   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1334     return MinBWs;
1335   }
1336 
1337   /// \returns True if it is more profitable to scalarize instruction \p I for
1338   /// vectorization factor \p VF.
1339   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1340     assert(VF.isVector() &&
1341            "Profitable to scalarize relevant only for VF > 1.");
1342 
1343     // Cost model is not run in the VPlan-native path - return conservative
1344     // result until this changes.
1345     if (EnableVPlanNativePath)
1346       return false;
1347 
1348     auto Scalars = InstsToScalarize.find(VF);
1349     assert(Scalars != InstsToScalarize.end() &&
1350            "VF not yet analyzed for scalarization profitability");
1351     return Scalars->second.find(I) != Scalars->second.end();
1352   }
1353 
1354   /// Returns true if \p I is known to be uniform after vectorization.
1355   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1356     if (VF.isScalar())
1357       return true;
1358 
1359     // Cost model is not run in the VPlan-native path - return conservative
1360     // result until this changes.
1361     if (EnableVPlanNativePath)
1362       return false;
1363 
1364     auto UniformsPerVF = Uniforms.find(VF);
1365     assert(UniformsPerVF != Uniforms.end() &&
1366            "VF not yet analyzed for uniformity");
1367     return UniformsPerVF->second.count(I);
1368   }
1369 
1370   /// Returns true if \p I is known to be scalar after vectorization.
1371   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1372     if (VF.isScalar())
1373       return true;
1374 
1375     // Cost model is not run in the VPlan-native path - return conservative
1376     // result until this changes.
1377     if (EnableVPlanNativePath)
1378       return false;
1379 
1380     auto ScalarsPerVF = Scalars.find(VF);
1381     assert(ScalarsPerVF != Scalars.end() &&
1382            "Scalar values are not calculated for VF");
1383     return ScalarsPerVF->second.count(I);
1384   }
1385 
1386   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1387   /// for vectorization factor \p VF.
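  /// For example, an instruction computed in i32 whose result only requires a
  /// narrower bitwidth may be performed on vectors with that narrower element
  /// type instead.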
1388   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1389     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1390            !isProfitableToScalarize(I, VF) &&
1391            !isScalarAfterVectorization(I, VF);
1392   }
1393 
1394   /// Decision that was taken during cost calculation for memory instruction.
1395   enum InstWidening {
1396     CM_Unknown,
1397     CM_Widen,         // For consecutive accesses with stride +1.
1398     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1399     CM_Interleave,
1400     CM_GatherScatter,
1401     CM_Scalarize
1402   };
1403 
1404   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1405   /// instruction \p I and vector width \p VF.
1406   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1407                            InstructionCost Cost) {
1408     assert(VF.isVector() && "Expected VF >=2");
1409     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1410   }
1411 
1412   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1413   /// interleaving group \p Grp and vector width \p VF.
1414   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1415                            ElementCount VF, InstWidening W,
1416                            InstructionCost Cost) {
1417     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1420     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1421       if (auto *I = Grp->getMember(i)) {
1422         if (Grp->getInsertPos() == I)
1423           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1424         else
1425           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1426       }
1427     }
1428   }
1429 
1430   /// Return the cost model decision for the given instruction \p I and vector
1431   /// width \p VF. Return CM_Unknown if this instruction did not pass
1432   /// through the cost modeling.
1433   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1434     assert(VF.isVector() && "Expected VF to be a vector VF");
1435     // Cost model is not run in the VPlan-native path - return conservative
1436     // result until this changes.
1437     if (EnableVPlanNativePath)
1438       return CM_GatherScatter;
1439 
1440     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1441     auto Itr = WideningDecisions.find(InstOnVF);
1442     if (Itr == WideningDecisions.end())
1443       return CM_Unknown;
1444     return Itr->second.first;
1445   }
1446 
1447   /// Return the vectorization cost for the given instruction \p I and vector
1448   /// width \p VF.
1449   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1450     assert(VF.isVector() && "Expected VF >=2");
1451     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1452     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1453            "The cost is not calculated");
1454     return WideningDecisions[InstOnVF].second;
1455   }
1456 
1457   /// Return True if instruction \p I is an optimizable truncate whose operand
1458   /// is an induction variable. Such a truncate will be removed by adding a new
1459   /// induction variable with the destination type.
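  /// For example, a 'trunc i64 %iv to i32' of an induction variable can be
  /// made redundant by introducing a new i32 induction variable instead.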
1460   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1461     // If the instruction is not a truncate, return false.
1462     auto *Trunc = dyn_cast<TruncInst>(I);
1463     if (!Trunc)
1464       return false;
1465 
1466     // Get the source and destination types of the truncate.
1467     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1468     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1469 
1470     // If the truncate is free for the given types, return false. Replacing a
1471     // free truncate with an induction variable would add an induction variable
1472     // update instruction to each iteration of the loop. We exclude from this
1473     // check the primary induction variable since it will need an update
1474     // instruction regardless.
1475     Value *Op = Trunc->getOperand(0);
1476     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1477       return false;
1478 
1479     // If the truncated value is not an induction variable, return false.
1480     return Legal->isInductionPhi(Op);
1481   }
1482 
1483   /// Collects the instructions to scalarize for each predicated instruction in
1484   /// the loop.
1485   void collectInstsToScalarize(ElementCount VF);
1486 
1487   /// Collect Uniform and Scalar values for the given \p VF.
1488   /// The sets depend on CM decision for Load/Store instructions
1489   /// that may be vectorized as interleave, gather-scatter or scalarized.
1490   void collectUniformsAndScalars(ElementCount VF) {
1491     // Do the analysis once.
1492     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1493       return;
1494     setCostBasedWideningDecision(VF);
1495     collectLoopUniforms(VF);
1496     collectLoopScalars(VF);
1497   }
1498 
1499   /// Returns true if the target machine supports masked store operation
1500   /// for the given \p DataType and kind of access to \p Ptr.
1501   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1502     return Legal->isConsecutivePtr(DataType, Ptr) &&
1503            TTI.isLegalMaskedStore(DataType, Alignment);
1504   }
1505 
1506   /// Returns true if the target machine supports masked load operation
1507   /// for the given \p DataType and kind of access to \p Ptr.
1508   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1509     return Legal->isConsecutivePtr(DataType, Ptr) &&
1510            TTI.isLegalMaskedLoad(DataType, Alignment);
1511   }
1512 
1513   /// Returns true if the target machine can represent \p V as a masked gather
1514   /// or scatter operation.
1515   bool isLegalGatherOrScatter(Value *V,
1516                               ElementCount VF = ElementCount::getFixed(1)) {
1517     bool LI = isa<LoadInst>(V);
1518     bool SI = isa<StoreInst>(V);
1519     if (!LI && !SI)
1520       return false;
1521     auto *Ty = getLoadStoreType(V);
1522     Align Align = getLoadStoreAlignment(V);
1523     if (VF.isVector())
1524       Ty = VectorType::get(Ty, VF);
1525     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1526            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1527   }
1528 
1529   /// Returns true if the target machine supports all of the reduction
1530   /// variables found for the given VF.
1531   bool canVectorizeReductions(ElementCount VF) const {
1532     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1533       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1534       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1535     }));
1536   }
1537 
1538   /// Returns true if \p I is an instruction that will be scalarized with
1539   /// predication when vectorizing \p I with vectorization factor \p VF. Such
1540   /// instructions include conditional stores and instructions that may divide
1541   /// by zero.
1542   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1543 
  /// Returns true if \p I is an instruction that will be predicated either
  /// through scalar predication or masked load/store or masked gather/scatter.
  /// \p VF is the vectorization factor that will be used to vectorize \p I.
  /// Superset of instructions that return true for isScalarWithPredication.
1548   bool isPredicatedInst(Instruction *I, ElementCount VF,
1549                         bool IsKnownUniform = false) {
    // When we know the load is uniform and the original scalar loop was not
    // predicated, we don't need to mark it as a predicated instruction. Any
    // vectorized blocks created when tail-folding are something artificial we
    // have introduced and we know there is always at least one active lane.
    // That's why we call Legal->blockNeedsPredication here, because it doesn't
    // account for tail-folding.
1556     if (IsKnownUniform && isa<LoadInst>(I) &&
1557         !Legal->blockNeedsPredication(I->getParent()))
1558       return false;
1559     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1560       return false;
1561     // Loads and stores that need some form of masked operation are predicated
1562     // instructions.
1563     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1564       return Legal->isMaskRequired(I);
1565     return isScalarWithPredication(I, VF);
1566   }
1567 
1568   /// Returns true if \p I is a memory instruction with consecutive memory
1569   /// access that can be widened.
1570   bool
1571   memoryInstructionCanBeWidened(Instruction *I,
1572                                 ElementCount VF = ElementCount::getFixed(1));
1573 
1574   /// Returns true if \p I is a memory instruction in an interleaved-group
1575   /// of memory accesses that can be vectorized with wide vector loads/stores
1576   /// and shuffles.
1577   bool
1578   interleavedAccessCanBeWidened(Instruction *I,
1579                                 ElementCount VF = ElementCount::getFixed(1));
1580 
1581   /// Check if \p Instr belongs to any interleaved access group.
1582   bool isAccessInterleaved(Instruction *Instr) {
1583     return InterleaveInfo.isInterleaved(Instr);
1584   }
1585 
1586   /// Get the interleaved access group that \p Instr belongs to.
1587   const InterleaveGroup<Instruction> *
1588   getInterleavedAccessGroup(Instruction *Instr) {
1589     return InterleaveInfo.getInterleaveGroup(Instr);
1590   }
1591 
1592   /// Returns true if we're required to use a scalar epilogue for at least
1593   /// the final iteration of the original loop.
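  /// This can be the case, for example, when an interleave group with gaps
  /// might access memory out of bounds in the last vector iteration.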
1594   bool requiresScalarEpilogue(ElementCount VF) const {
1595     if (!isScalarEpilogueAllowed())
1596       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1599     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1600       return true;
1601     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1602   }
1603 
  /// Returns true if a scalar epilogue is allowed, i.e., not prohibited by
  /// optsize or a loop hint annotation.
1606   bool isScalarEpilogueAllowed() const {
1607     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1608   }
1609 
  /// Returns true if all loop blocks should be masked to fold the tail of the
  /// loop.
1611   bool foldTailByMasking() const { return FoldTailByMasking; }
1612 
  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
1616   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1617     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1618   }
1619 
1620   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1621   /// nodes to the chain of instructions representing the reductions. Uses a
1622   /// MapVector to ensure deterministic iteration order.
1623   using ReductionChainMap =
1624       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1625 
1626   /// Return the chain of instructions representing an inloop reduction.
1627   const ReductionChainMap &getInLoopReductionChains() const {
1628     return InLoopReductionChains;
1629   }
1630 
1631   /// Returns true if the Phi is part of an inloop reduction.
1632   bool isInLoopReduction(PHINode *Phi) const {
1633     return InLoopReductionChains.count(Phi);
1634   }
1635 
1636   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1637   /// with factor VF.  Return the cost of the instruction, including
1638   /// scalarization overhead if it's needed.
1639   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1640 
1641   /// Estimate cost of a call instruction CI if it were vectorized with factor
1642   /// VF. Return the cost of the instruction, including scalarization overhead
1643   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1644   /// scalarized -
1645   /// i.e. either vector version isn't available, or is too expensive.
1646   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1647                                     bool &NeedToScalarize) const;
1648 
1649   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1650   /// that of B.
1651   bool isMoreProfitable(const VectorizationFactor &A,
1652                         const VectorizationFactor &B) const;
1653 
1654   /// Invalidates decisions already taken by the cost model.
1655   void invalidateCostModelingDecisions() {
1656     WideningDecisions.clear();
1657     Uniforms.clear();
1658     Scalars.clear();
1659   }
1660 
1661 private:
1662   unsigned NumPredStores = 0;
1663 
  /// Convenience function that returns the value of vscale_range iff
  /// vscale_range.min == vscale_range.max or otherwise returns the value
  /// returned by the corresponding TTI method.
1667   Optional<unsigned> getVScaleForTuning() const;
1668 
1669   /// \return An upper bound for the vectorization factors for both
1670   /// fixed and scalable vectorization, where the minimum-known number of
1671   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1672   /// disabled or unsupported, then the scalable part will be equal to
1673   /// ElementCount::getScalable(0).
1674   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1675                                            ElementCount UserVF,
1676                                            bool FoldTailByMasking);
1677 
  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
1680   /// This is a helper function of computeFeasibleMaxVF.
1681   /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1682   /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
1684   /// D98509). The issue is currently under investigation and this workaround
1685   /// will be removed as soon as possible.
1686   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1687                                        unsigned SmallestType,
1688                                        unsigned WidestType,
1689                                        const ElementCount &MaxSafeVF,
1690                                        bool FoldTailByMasking);
1691 
1692   /// \return the maximum legal scalable VF, based on the safe max number
1693   /// of elements.
1694   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1695 
1696   /// The vectorization cost is a combination of the cost itself and a boolean
1697   /// indicating whether any of the contributing operations will actually
1698   /// operate on vector values after type legalization in the backend. If this
1699   /// latter value is false, then all operations will be scalarized (i.e. no
1700   /// vectorization has actually taken place).
1701   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1702 
1703   /// Returns the expected execution cost. The unit of the cost does
1704   /// not matter because we use the 'cost' units to compare different
1705   /// vector widths. The cost that is returned is *not* normalized by
1706   /// the factor width. If \p Invalid is not nullptr, this function
1707   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1708   /// each instruction that has an Invalid cost for the given VF.
1709   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1710   VectorizationCostTy
1711   expectedCost(ElementCount VF,
1712                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1713 
1714   /// Returns the execution time cost of an instruction for a given vector
1715   /// width. Vector width of one means scalar.
1716   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1717 
1718   /// The cost-computation logic from getInstructionCost which provides
1719   /// the vector type as an output parameter.
1720   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1721                                      Type *&VectorTy);
1722 
1723   /// Return the cost of instructions in an inloop reduction pattern, if I is
1724   /// part of that pattern.
1725   Optional<InstructionCost>
1726   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1727                           TTI::TargetCostKind CostKind);
1728 
1729   /// Calculate vectorization cost of memory instruction \p I.
1730   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1731 
1732   /// The cost computation for scalarized memory instruction.
1733   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1734 
1735   /// The cost computation for interleaving group of memory instructions.
1736   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1737 
1738   /// The cost computation for Gather/Scatter instruction.
1739   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1740 
1741   /// The cost computation for widening instruction \p I with consecutive
1742   /// memory access.
1743   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1744 
1745   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1746   /// Load: scalar load + broadcast.
1747   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1748   /// element)
1749   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1750 
1751   /// Estimate the overhead of scalarizing an instruction. This is a
1752   /// convenience wrapper for the type-based getScalarizationOverhead API.
1753   InstructionCost getScalarizationOverhead(Instruction *I,
1754                                            ElementCount VF) const;
1755 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1758   bool isConsecutiveLoadOrStore(Instruction *I);
1759 
1760   /// Returns true if an artificially high cost for emulated masked memrefs
1761   /// should be used.
1762   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1763 
1764   /// Map of scalar integer values to the smallest bitwidth they can be legally
1765   /// represented as. The vector equivalents of these values should be truncated
1766   /// to this type.
1767   MapVector<Instruction *, uint64_t> MinBWs;
1768 
1769   /// A type representing the costs for instructions if they were to be
1770   /// scalarized rather than vectorized. The entries are Instruction-Cost
1771   /// pairs.
1772   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1773 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1776   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1777 
1778   /// Records whether it is allowed to have the original scalar loop execute at
1779   /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
1783   /// Under optsize and when the trip count is very small we don't allow any
1784   /// iterations to execute in the scalar loop.
1785   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1786 
1787   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1788   bool FoldTailByMasking = false;
1789 
1790   /// A map holding scalar costs for different vectorization factors. The
1791   /// presence of a cost for an instruction in the mapping indicates that the
1792   /// instruction will be scalarized when vectorizing with the associated
1793   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1794   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1795 
1796   /// Holds the instructions known to be uniform after vectorization.
1797   /// The data is collected per VF.
1798   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1799 
1800   /// Holds the instructions known to be scalar after vectorization.
1801   /// The data is collected per VF.
1802   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1803 
1804   /// Holds the instructions (address computations) that are forced to be
1805   /// scalarized.
1806   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1807 
  /// PHINodes of the reductions that should be expanded in-loop along with
  /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1811   ReductionChainMap InLoopReductionChains;
1812 
  /// A map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// VPlan. This was added to allow quick lookup of the inloop operations,
  /// without having to loop through InLoopReductionChains.
1817   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1818 
1819   /// Returns the expected difference in cost from scalarizing the expression
1820   /// feeding a predicated instruction \p PredInst. The instructions to
1821   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1822   /// non-negative return value implies the expression will be scalarized.
1823   /// Currently, only single-use chains are considered for scalarization.
1824   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1825                               ElementCount VF);
1826 
1827   /// Collect the instructions that are uniform after vectorization. An
1828   /// instruction is uniform if we represent it with a single scalar value in
1829   /// the vectorized loop corresponding to each vector iteration. Examples of
1830   /// uniform instructions include pointer operands of consecutive or
1831   /// interleaved memory accesses. Note that although uniformity implies an
1832   /// instruction will be scalar, the reverse is not true. In general, a
1833   /// scalarized instruction will be represented by VF scalar values in the
1834   /// vectorized loop, each corresponding to an iteration of the original
1835   /// scalar loop.
1836   void collectLoopUniforms(ElementCount VF);
1837 
1838   /// Collect the instructions that are scalar after vectorization. An
1839   /// instruction is scalar if it is known to be uniform or will be scalarized
1840   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1841   /// to the list if they are used by a load/store instruction that is marked as
1842   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1843   /// VF values in the vectorized loop, each corresponding to an iteration of
1844   /// the original scalar loop.
1845   void collectLoopScalars(ElementCount VF);
1846 
1847   /// Keeps cost model vectorization decision and cost for instructions.
1848   /// Right now it is used for memory instructions only.
1849   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1850                                 std::pair<InstWidening, InstructionCost>>;
1851 
1852   DecisionList WideningDecisions;
1853 
1854   /// Returns true if \p V is expected to be vectorized and it needs to be
1855   /// extracted.
1856   bool needsExtract(Value *V, ElementCount VF) const {
1857     Instruction *I = dyn_cast<Instruction>(V);
1858     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1859         TheLoop->isLoopInvariant(I))
1860       return false;
1861 
1862     // Assume we can vectorize V (and hence we need extraction) if the
1863     // scalars are not computed yet. This can happen, because it is called
1864     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1865     // the scalars are collected. That should be a safe assumption in most
1866     // cases, because we check if the operands have vectorizable types
1867     // beforehand in LoopVectorizationLegality.
1868     return Scalars.find(VF) == Scalars.end() ||
1869            !isScalarAfterVectorization(I, VF);
1870   };
1871 
1872   /// Returns a range containing only operands needing to be extracted.
1873   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1874                                                    ElementCount VF) const {
1875     return SmallVector<Value *, 4>(make_filter_range(
1876         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1877   }
1878 
1879   /// Determines if we have the infrastructure to vectorize loop \p L and its
1880   /// epilogue, assuming the main loop is vectorized by \p VF.
1881   bool isCandidateForEpilogueVectorization(const Loop &L,
1882                                            const ElementCount VF) const;
1883 
1884   /// Returns true if epilogue vectorization is considered profitable, and
1885   /// false otherwise.
1886   /// \p VF is the vectorization factor chosen for the original loop.
1887   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1888 
1889 public:
1890   /// The loop that we evaluate.
1891   Loop *TheLoop;
1892 
1893   /// Predicated scalar evolution analysis.
1894   PredicatedScalarEvolution &PSE;
1895 
1896   /// Loop Info analysis.
1897   LoopInfo *LI;
1898 
1899   /// Vectorization legality.
1900   LoopVectorizationLegality *Legal;
1901 
1902   /// Vector target information.
1903   const TargetTransformInfo &TTI;
1904 
1905   /// Target Library Info.
1906   const TargetLibraryInfo *TLI;
1907 
1908   /// Demanded bits analysis.
1909   DemandedBits *DB;
1910 
1911   /// Assumption cache.
1912   AssumptionCache *AC;
1913 
1914   /// Interface to emit optimization remarks.
1915   OptimizationRemarkEmitter *ORE;
1916 
1917   const Function *TheFunction;
1918 
1919   /// Loop Vectorize Hint.
1920   const LoopVectorizeHints *Hints;
1921 
  /// The interleaved access information contains groups of interleaved accesses
  /// with the same stride that are close to each other.
1924   InterleavedAccessInfo &InterleaveInfo;
1925 
1926   /// Values to ignore in the cost model.
1927   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1928 
1929   /// Values to ignore in the cost model when VF > 1.
1930   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1931 
1932   /// All element types found in the loop.
1933   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1934 
1935   /// Profitable vector factors.
1936   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1937 };
1938 } // end namespace llvm
1939 
1940 /// Helper struct to manage generating runtime checks for vectorization.
1941 ///
/// The runtime checks are created up-front in temporary blocks, un-linked from
/// the existing IR, to allow better cost estimation. After deciding to
/// vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
1946 class GeneratedRTChecks {
1947   /// Basic block which contains the generated SCEV checks, if any.
1948   BasicBlock *SCEVCheckBlock = nullptr;
1949 
1950   /// The value representing the result of the generated SCEV checks. If it is
1951   /// nullptr, either no SCEV checks have been generated or they have been used.
1952   Value *SCEVCheckCond = nullptr;
1953 
1954   /// Basic block which contains the generated memory runtime checks, if any.
1955   BasicBlock *MemCheckBlock = nullptr;
1956 
1957   /// The value representing the result of the generated memory runtime checks.
1958   /// If it is nullptr, either no memory runtime checks have been generated or
1959   /// they have been used.
1960   Value *MemRuntimeCheckCond = nullptr;
1961 
1962   DominatorTree *DT;
1963   LoopInfo *LI;
1964 
1965   SCEVExpander SCEVExp;
1966   SCEVExpander MemCheckExp;
1967 
1968 public:
1969   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1970                     const DataLayout &DL)
1971       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
1972         MemCheckExp(SE, DL, "scev.check") {}
1973 
  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation.
  /// If there is no vector code generation, the check blocks are removed
  /// completely.
1979   void Create(Loop *L, const LoopAccessInfo &LAI,
1980               const SCEVPredicate &Pred) {
1981 
1982     BasicBlock *LoopHeader = L->getHeader();
1983     BasicBlock *Preheader = L->getLoopPreheader();
1984 
1985     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1986     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1987     // may be used by SCEVExpander. The blocks will be un-linked from their
1988     // predecessors and removed from LI & DT at the end of the function.
1989     if (!Pred.isAlwaysTrue()) {
1990       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1991                                   nullptr, "vector.scevcheck");
1992 
1993       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1994           &Pred, SCEVCheckBlock->getTerminator());
1995     }
1996 
1997     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1998     if (RtPtrChecking.Need) {
1999       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2000       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2001                                  "vector.memcheck");
2002 
2003       MemRuntimeCheckCond =
2004           addRuntimeChecks(MemCheckBlock->getTerminator(), L,
2005                            RtPtrChecking.getChecks(), MemCheckExp);
2006       assert(MemRuntimeCheckCond &&
2007              "no RT checks generated although RtPtrChecking "
2008              "claimed checks are required");
2009     }
2010 
2011     if (!MemCheckBlock && !SCEVCheckBlock)
2012       return;
2013 
    // Unhook the temporary blocks with the checks and update various places
    // accordingly.
2016     if (SCEVCheckBlock)
2017       SCEVCheckBlock->replaceAllUsesWith(Preheader);
2018     if (MemCheckBlock)
2019       MemCheckBlock->replaceAllUsesWith(Preheader);
2020 
2021     if (SCEVCheckBlock) {
2022       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2023       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2024       Preheader->getTerminator()->eraseFromParent();
2025     }
2026     if (MemCheckBlock) {
2027       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2028       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2029       Preheader->getTerminator()->eraseFromParent();
2030     }
2031 
2032     DT->changeImmediateDominator(LoopHeader, Preheader);
2033     if (MemCheckBlock) {
2034       DT->eraseNode(MemCheckBlock);
2035       LI->removeBlock(MemCheckBlock);
2036     }
2037     if (SCEVCheckBlock) {
2038       DT->eraseNode(SCEVCheckBlock);
2039       LI->removeBlock(SCEVCheckBlock);
2040     }
2041   }
2042 
2043   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2044   /// unused.
2045   ~GeneratedRTChecks() {
2046     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2047     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2048     if (!SCEVCheckCond)
2049       SCEVCleaner.markResultUsed();
2050 
2051     if (!MemRuntimeCheckCond)
2052       MemCheckCleaner.markResultUsed();
2053 
2054     if (MemRuntimeCheckCond) {
2055       auto &SE = *MemCheckExp.getSE();
2056       // Memory runtime check generation creates compares that use expanded
2057       // values. Remove them before running the SCEVExpanderCleaners.
2058       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2059         if (MemCheckExp.isInsertedInstruction(&I))
2060           continue;
2061         SE.forgetValue(&I);
2062         I.eraseFromParent();
2063       }
2064     }
2065     MemCheckCleaner.cleanup();
2066     SCEVCleaner.cleanup();
2067 
2068     if (SCEVCheckCond)
2069       SCEVCheckBlock->eraseFromParent();
2070     if (MemRuntimeCheckCond)
2071       MemCheckBlock->eraseFromParent();
2072   }
2073 
2074   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2075   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2076   /// depending on the generated condition.
2077   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2078                              BasicBlock *LoopVectorPreHeader,
2079                              BasicBlock *LoopExitBlock) {
2080     if (!SCEVCheckCond)
2081       return nullptr;
2082     if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2083       if (C->isZero())
2084         return nullptr;
2085 
2086     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2087 
2088     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
    // If the vector preheader is inside a loop, add the SCEV check block to
    // that loop as well.
2090     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2091       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2092 
2093     SCEVCheckBlock->getTerminator()->eraseFromParent();
2094     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2095     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2096                                                 SCEVCheckBlock);
2097 
2098     DT->addNewBlock(SCEVCheckBlock, Pred);
2099     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2100 
2101     ReplaceInstWithInst(
2102         SCEVCheckBlock->getTerminator(),
2103         BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2104     // Mark the check as used, to prevent it from being removed during cleanup.
2105     SCEVCheckCond = nullptr;
2106     return SCEVCheckBlock;
2107   }
2108 
2109   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2110   /// the branches to branch to the vector preheader or \p Bypass, depending on
2111   /// the generated condition.
2112   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2113                                    BasicBlock *LoopVectorPreHeader) {
2114     // Check if we generated code that checks in runtime if arrays overlap.
2115     if (!MemRuntimeCheckCond)
2116       return nullptr;
2117 
2118     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2119     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2120                                                 MemCheckBlock);
2121 
2122     DT->addNewBlock(MemCheckBlock, Pred);
2123     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2124     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2125 
2126     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2127       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2128 
2129     ReplaceInstWithInst(
2130         MemCheckBlock->getTerminator(),
2131         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2132     MemCheckBlock->getTerminator()->setDebugLoc(
2133         Pred->getTerminator()->getDebugLoc());
2134 
2135     // Mark the check as used, to prevent it from being removed during cleanup.
2136     MemRuntimeCheckCond = nullptr;
2137     return MemCheckBlock;
2138   }
2139 };
2140 
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If
// the vector length information is not provided, vectorization is not
// considered explicit. Interleave hints are not allowed either. These
// limitations will be relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
2148 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2149 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2150 // provides *explicit vectorization hints* (LV can bypass legal checks and
2151 // assume that vectorization is legal). However, both hints are implemented
2152 // using the same metadata (llvm.loop.vectorize, processed by
2153 // LoopVectorizeHints). This will be fixed in the future when the native IR
2154 // representation for pragma 'omp simd' is introduced.
2155 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2156                                    OptimizationRemarkEmitter *ORE) {
2157   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2158   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2159 
2160   // Only outer loops with an explicit vectorization hint are supported.
2161   // Unannotated outer loops are ignored.
2162   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2163     return false;
2164 
2165   Function *Fn = OuterLp->getHeader()->getParent();
2166   if (!Hints.allowVectorization(Fn, OuterLp,
2167                                 true /*VectorizeOnlyWhenForced*/)) {
2168     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2169     return false;
2170   }
2171 
2172   if (Hints.getInterleave() > 1) {
2173     // TODO: Interleave support is future work.
2174     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2175                          "outer loops.\n");
2176     Hints.emitRemarkWithHints();
2177     return false;
2178   }
2179 
2180   return true;
2181 }
2182 
2183 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2184                                   OptimizationRemarkEmitter *ORE,
2185                                   SmallVectorImpl<Loop *> &V) {
2186   // Collect inner loops and outer loops without irreducible control flow. For
2187   // now, only collect outer loops that have explicit vectorization hints. If we
2188   // are stress testing the VPlan H-CFG construction, we collect the outermost
2189   // loop of every loop nest.
2190   if (L.isInnermost() || VPlanBuildStressTest ||
2191       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2192     LoopBlocksRPO RPOT(&L);
2193     RPOT.perform(LI);
2194     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2195       V.push_back(&L);
2196       // TODO: Collect inner loops inside marked outer loops in case
2197       // vectorization fails for the outer loop. Do not invoke
2198       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2199       // already known to be reducible. We can use an inherited attribute for
2200       // that.
2201       return;
2202     }
2203   }
2204   for (Loop *InnerL : L)
2205     collectSupportedLoops(*InnerL, LI, ORE, V);
2206 }
2207 
2208 namespace {
2209 
2210 /// The LoopVectorize Pass.
2211 struct LoopVectorize : public FunctionPass {
2212   /// Pass identification, replacement for typeid
2213   static char ID;
2214 
2215   LoopVectorizePass Impl;
2216 
2217   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2218                          bool VectorizeOnlyWhenForced = false)
2219       : FunctionPass(ID),
2220         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2221     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2222   }
2223 
2224   bool runOnFunction(Function &F) override {
2225     if (skipFunction(F))
2226       return false;
2227 
2228     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2229     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2230     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2231     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2232     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2233     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2234     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2235     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2236     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2237     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2238     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2239     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2240     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2241 
2242     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2243         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2244 
2245     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2246                         GetLAA, *ORE, PSI).MadeAnyChange;
2247   }
2248 
2249   void getAnalysisUsage(AnalysisUsage &AU) const override {
2250     AU.addRequired<AssumptionCacheTracker>();
2251     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2252     AU.addRequired<DominatorTreeWrapperPass>();
2253     AU.addRequired<LoopInfoWrapperPass>();
2254     AU.addRequired<ScalarEvolutionWrapperPass>();
2255     AU.addRequired<TargetTransformInfoWrapperPass>();
2256     AU.addRequired<AAResultsWrapperPass>();
2257     AU.addRequired<LoopAccessLegacyAnalysis>();
2258     AU.addRequired<DemandedBitsWrapperPass>();
2259     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2260     AU.addRequired<InjectTLIMappingsLegacy>();
2261 
    // We currently do not preserve LoopInfo/DominatorTree analyses with outer
    // loop vectorization. Until this is addressed, mark these analyses as
    // preserved only for the non-VPlan-native path.
2265     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2266     if (!EnableVPlanNativePath) {
2267       AU.addPreserved<LoopInfoWrapperPass>();
2268       AU.addPreserved<DominatorTreeWrapperPass>();
2269     }
2270 
2271     AU.addPreserved<BasicAAWrapperPass>();
2272     AU.addPreserved<GlobalsAAWrapperPass>();
2273     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2274   }
2275 };
2276 
2277 } // end anonymous namespace
2278 
2279 //===----------------------------------------------------------------------===//
2280 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2281 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2282 //===----------------------------------------------------------------------===//
2283 
2284 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
2288   Instruction *Instr = dyn_cast<Instruction>(V);
2289   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2290                      (!Instr ||
2291                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2292   // Place the code for broadcasting invariant variables in the new preheader.
2293   IRBuilder<>::InsertPointGuard Guard(Builder);
2294   if (SafeToHoist)
2295     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2296 
2297   // Broadcast the scalar into all locations in the vector.
2298   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2299 
2300   return Shuf;
2301 }
2302 
/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to the corresponding vector elements of Val. The sequence starts at StartIdx.
/// \p BinOp is only relevant for FP induction variables.
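/// For example, with VF = 4, StartIdx = 0 and Step = 2 this produces
/// Val + <0, 2, 4, 6> for an integer induction.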
2307 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2308                             Instruction::BinaryOps BinOp, ElementCount VF,
2309                             IRBuilderBase &Builder) {
2310   assert(VF.isVector() && "only vector VFs are supported");
2311 
2312   // Create and check the types.
2313   auto *ValVTy = cast<VectorType>(Val->getType());
2314   ElementCount VLen = ValVTy->getElementCount();
2315 
2316   Type *STy = Val->getType()->getScalarType();
2317   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2318          "Induction Step must be an integer or FP");
2319   assert(Step->getType() == STy && "Step has wrong type");
2320 
2321   SmallVector<Constant *, 8> Indices;
2322 
  // Create a vector of consecutive numbers from zero to VF - 1.
2324   VectorType *InitVecValVTy = ValVTy;
2325   if (STy->isFloatingPointTy()) {
2326     Type *InitVecValSTy =
2327         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2328     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2329   }
2330   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2331 
2332   // Splat the StartIdx
2333   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2334 
2335   if (STy->isIntegerTy()) {
2336     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2337     Step = Builder.CreateVectorSplat(VLen, Step);
2338     assert(Step->getType() == Val->getType() && "Invalid step vec");
2339     // FIXME: The newly created binary instructions should contain nsw/nuw
2340     // flags, which can be found from the original scalar operations.
2341     Step = Builder.CreateMul(InitVec, Step);
2342     return Builder.CreateAdd(Val, Step, "induction");
2343   }
2344 
2345   // Floating point induction.
2346   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2347          "Binary Opcode should be specified for FP induction");
2348   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2349   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2350 
2351   Step = Builder.CreateVectorSplat(VLen, Step);
2352   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2353   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2354 }
2355 
2356 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2357 /// variable on which to base the steps, \p Step is the size of the step.
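/// For example, for an integer induction, lane L of unroll part P receives the
/// value ScalarIV + (P * VF + L) * Step.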
2358 static void buildScalarSteps(Value *ScalarIV, Value *Step,
2359                              const InductionDescriptor &ID, VPValue *Def,
2360                              VPTransformState &State) {
2361   IRBuilderBase &Builder = State.Builder;
2362   // We shouldn't have to build scalar steps if we aren't vectorizing.
2363   assert(State.VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2365   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2366   assert(ScalarIVTy == Step->getType() &&
2367          "Val and Step should have the same type");
2368 
2369   // We build scalar steps for both integer and floating-point induction
2370   // variables. Here, we determine the kind of arithmetic we will perform.
2371   Instruction::BinaryOps AddOp;
2372   Instruction::BinaryOps MulOp;
2373   if (ScalarIVTy->isIntegerTy()) {
2374     AddOp = Instruction::Add;
2375     MulOp = Instruction::Mul;
2376   } else {
2377     AddOp = ID.getInductionOpcode();
2378     MulOp = Instruction::FMul;
2379   }
2380 
2381   // Determine the number of scalars we need to generate for each unroll
2382   // iteration.
2383   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2384   unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2385   // Compute the scalar steps and save the results in State.
2386   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2387                                      ScalarIVTy->getScalarSizeInBits());
2388   Type *VecIVTy = nullptr;
2389   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2390   if (!FirstLaneOnly && State.VF.isScalable()) {
2391     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2392     UnitStepVec =
2393         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2394     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2395     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2396   }
2397 
2398   for (unsigned Part = 0; Part < State.UF; ++Part) {
2399     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2400 
2401     if (!FirstLaneOnly && State.VF.isScalable()) {
2402       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2403       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2404       if (ScalarIVTy->isFloatingPointTy())
2405         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2406       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2407       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2408       State.set(Def, Add, Part);
2409       // It's also useful to record the lane values for the known minimum
2410       // number of elements, so we do that below as well. This improves code
2411       // quality when extracting the first element, for example.
2412     }
2413 
2414     if (ScalarIVTy->isFloatingPointTy())
2415       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2416 
2417     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2418       Value *StartIdx = Builder.CreateBinOp(
2419           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2420       // The step returned by `createStepForVF` is a runtime-evaluated value
2421       // when VF is scalable. Otherwise, it should be folded into a Constant.
2422       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2423              "Expected StartIdx to be folded to a constant when VF is not "
2424              "scalable");
2425       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2426       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2427       State.set(Def, Add, VPIteration(Part, Lane));
2428     }
2429   }
2430 }
2431 
2432 // Generate code for the induction step. Note that induction steps are
2433 // required to be loop-invariant.
2434 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2435                               Instruction *InsertBefore,
2436                               Loop *OrigLoop = nullptr) {
2437   const DataLayout &DL = SE.getDataLayout();
2438   assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2439          "Induction step should be loop invariant");
2440   if (auto *E = dyn_cast<SCEVUnknown>(Step))
2441     return E->getValue();
2442 
2443   SCEVExpander Exp(SE, DL, "induction");
2444   return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2445 }
2446 
2447 /// Compute the transformed value of Index at offset StartValue using step
2448 /// StepValue.
2449 /// For integer induction, returns StartValue + Index * StepValue.
2450 /// For pointer induction, returns StartValue[Index * StepValue].
2451 /// FIXME: The newly created binary instructions should contain nsw/nuw
2452 /// flags, which can be found from the original scalar operations.
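/// For example, for an integer induction with StartValue 7 and StepValue 2,
/// Index 5 is transformed to 7 + 5 * 2 = 17; for a pointer induction it would
/// address StartValue[10].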
2453 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2454                                    Value *StartValue, Value *Step,
2455                                    const InductionDescriptor &ID) {
2456   assert(Index->getType()->getScalarType() == Step->getType() &&
2457          "Index scalar type does not match StepValue type");
2458 
2459   // Note: the IR at this point is broken. We cannot use SE to create any new
2460   // SCEV and then expand it, hoping that SCEV's simplification will give us
2461   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2462   // lead to various SCEV crashes. So all we can do is use the builder and
2463   // rely on InstCombine for future simplifications. Here we handle only some
2464   // trivial cases.
2465   auto CreateAdd = [&B](Value *X, Value *Y) {
2466     assert(X->getType() == Y->getType() && "Types don't match!");
2467     if (auto *CX = dyn_cast<ConstantInt>(X))
2468       if (CX->isZero())
2469         return Y;
2470     if (auto *CY = dyn_cast<ConstantInt>(Y))
2471       if (CY->isZero())
2472         return X;
2473     return B.CreateAdd(X, Y);
2474   };
2475 
2476   // We allow X to be a vector type, in which case Y will potentially be
2477   // splatted into a vector with the same element count.
2478   auto CreateMul = [&B](Value *X, Value *Y) {
2479     assert(X->getType()->getScalarType() == Y->getType() &&
2480            "Types don't match!");
2481     if (auto *CX = dyn_cast<ConstantInt>(X))
2482       if (CX->isOne())
2483         return Y;
2484     if (auto *CY = dyn_cast<ConstantInt>(Y))
2485       if (CY->isOne())
2486         return X;
2487     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2488     if (XVTy && !isa<VectorType>(Y->getType()))
2489       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2490     return B.CreateMul(X, Y);
2491   };
2492 
2493   switch (ID.getKind()) {
2494   case InductionDescriptor::IK_IntInduction: {
2495     assert(!isa<VectorType>(Index->getType()) &&
2496            "Vector indices not supported for integer inductions yet");
2497     assert(Index->getType() == StartValue->getType() &&
2498            "Index type does not match StartValue type");
2499     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2500       return B.CreateSub(StartValue, Index);
2501     auto *Offset = CreateMul(Index, Step);
2502     return CreateAdd(StartValue, Offset);
2503   }
2504   case InductionDescriptor::IK_PtrInduction: {
2505     assert(isa<Constant>(Step) &&
2506            "Expected constant step for pointer induction");
2507     return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2508   }
2509   case InductionDescriptor::IK_FpInduction: {
2510     assert(!isa<VectorType>(Index->getType()) &&
2511            "Vector indices not supported for FP inductions yet");
2512     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2513     auto InductionBinOp = ID.getInductionBinOp();
2514     assert(InductionBinOp &&
2515            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2516             InductionBinOp->getOpcode() == Instruction::FSub) &&
2517            "Original bin op should be defined for FP induction");
2518 
2519     Value *MulExp = B.CreateFMul(Step, Index);
2520     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2521                          "induction");
2522   }
2523   case InductionDescriptor::IK_NoInduction:
2524     return nullptr;
2525   }
2526   llvm_unreachable("invalid enum");
2527 }
2528 
2529 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2530                                                     const VPIteration &Instance,
2531                                                     VPTransformState &State) {
2532   Value *ScalarInst = State.get(Def, Instance);
2533   Value *VectorValue = State.get(Def, Instance.Part);
2534   VectorValue = Builder.CreateInsertElement(
2535       VectorValue, ScalarInst,
2536       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2537   State.set(Def, VectorValue, Instance.Part);
2538 }
2539 
2540 // Return whether we allow using masked interleave-groups (for dealing with
2541 // strided loads/stores that reside in predicated blocks, or for dealing
2542 // with gaps).
2543 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2544   // If an override option has been passed in for interleaved accesses, use it.
2545   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2546     return EnableMaskedInterleavedMemAccesses;
2547 
2548   return TTI.enableMaskedInterleavedAccessVectorization();
2549 }
2550 
2551 // Try to vectorize the interleave group that \p Instr belongs to.
2552 //
2553 // E.g. Translate following interleaved load group (factor = 3):
2554 //   for (i = 0; i < N; i+=3) {
2555 //     R = Pic[i];             // Member of index 0
2556 //     G = Pic[i+1];           // Member of index 1
2557 //     B = Pic[i+2];           // Member of index 2
2558 //     ... // do something to R, G, B
2559 //   }
2560 // To:
2561 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2562 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2563 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2564 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2565 //
2566 // Or translate following interleaved store group (factor = 3):
2567 //   for (i = 0; i < N; i+=3) {
2568 //     ... do something to R, G, B
2569 //     Pic[i]   = R;           // Member of index 0
2570 //     Pic[i+1] = G;           // Member of index 1
2571 //     Pic[i+2] = B;           // Member of index 2
2572 //   }
2573 // To:
2574 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2575 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2576 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2577 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2578 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2579 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2580     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2581     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2582     VPValue *BlockInMask) {
2583   Instruction *Instr = Group->getInsertPos();
2584   const DataLayout &DL = Instr->getModule()->getDataLayout();
2585 
2586   // Prepare for the vector type of the interleaved load/store.
2587   Type *ScalarTy = getLoadStoreType(Instr);
2588   unsigned InterleaveFactor = Group->getFactor();
2589   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2590   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2591 
2592   // Prepare for the new pointers.
2593   SmallVector<Value *, 2> AddrParts;
2594   unsigned Index = Group->getIndex(Instr);
2595 
2596   // TODO: extend the masked interleaved-group support to reversed access.
2597   assert((!BlockInMask || !Group->isReverse()) &&
2598          "Reversed masked interleave-group not supported.");
2599 
2600   // If the group is reverse, adjust the index to refer to the last vector lane
2601   // instead of the first. We adjust the index from the first vector lane,
2602   // rather than directly getting the pointer for lane VF - 1, because the
2603   // pointer operand of the interleaved access is supposed to be uniform. For
2604   // uniform instructions, we're only required to generate a value for the
2605   // first vector lane in each unroll iteration.
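  // For example, with VF = 4 and an interleave factor of 3, the index grows by
  // (4 - 1) * 3 = 9 extra elements, so the negated offset used below points at
  // the member-0 element accessed by the last vector lane.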
2606   if (Group->isReverse())
2607     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2608 
2609   for (unsigned Part = 0; Part < UF; Part++) {
2610     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2611     setDebugLocFromInst(AddrPart);
2612 
2613     // Note that the current instruction could be at any member index. We need
2614     // to adjust the address to the member of index 0.
2615     //
2616     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2617     //       b = A[i];       // Member of index 0
2618     // The current pointer points to A[i+1]; adjust it to A[i].
2619     //
2620     // E.g.  A[i+1] = a;     // Member of index 1
2621     //       A[i]   = b;     // Member of index 0
2622     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2623     // The current pointer points to A[i+2]; adjust it to A[i].
2624 
2625     bool InBounds = false;
2626     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2627       InBounds = gep->isInBounds();
2628     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2629     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2630 
2631     // Cast to the vector pointer type.
2632     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2633     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2634     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2635   }
2636 
2637   setDebugLocFromInst(Instr);
2638   Value *PoisonVec = PoisonValue::get(VecTy);
2639 
2640   Value *MaskForGaps = nullptr;
2641   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2642     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2643     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2644   }
2645 
2646   // Vectorize the interleaved load group.
2647   if (isa<LoadInst>(Instr)) {
2648     // For each unroll part, create a wide load for the group.
2649     SmallVector<Value *, 2> NewLoads;
2650     for (unsigned Part = 0; Part < UF; Part++) {
2651       Instruction *NewLoad;
2652       if (BlockInMask || MaskForGaps) {
2653         assert(useMaskedInterleavedAccesses(*TTI) &&
2654                "masked interleaved groups are not allowed.");
2655         Value *GroupMask = MaskForGaps;
2656         if (BlockInMask) {
2657           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2658           Value *ShuffledMask = Builder.CreateShuffleVector(
2659               BlockInMaskPart,
2660               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2661               "interleaved.mask");
2662           GroupMask = MaskForGaps
2663                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2664                                                 MaskForGaps)
2665                           : ShuffledMask;
2666         }
2667         NewLoad =
2668             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2669                                      GroupMask, PoisonVec, "wide.masked.vec");
2670       } else
2672         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2673                                             Group->getAlign(), "wide.vec");
2674       Group->addMetadata(NewLoad);
2675       NewLoads.push_back(NewLoad);
2676     }
2677 
2678     // For each member in the group, shuffle out the appropriate data from the
2679     // wide loads.
2680     unsigned J = 0;
2681     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2682       Instruction *Member = Group->getMember(I);
2683 
2684       // Skip the gaps in the group.
2685       if (!Member)
2686         continue;
2687 
2688       auto StrideMask =
2689           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2690       for (unsigned Part = 0; Part < UF; Part++) {
2691         Value *StridedVec = Builder.CreateShuffleVector(
2692             NewLoads[Part], StrideMask, "strided.vec");
2693 
2694         // If this member has a different type, cast the result to that type.
2695         if (Member->getType() != ScalarTy) {
2696           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2697           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2698           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2699         }
2700 
2701         if (Group->isReverse())
2702           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2703 
2704         State.set(VPDefs[J], StridedVec, Part);
2705       }
2706       ++J;
2707     }
2708     return;
2709   }
2710 
2711   // The subvector type for the current instruction.
2712   auto *SubVT = VectorType::get(ScalarTy, VF);
2713 
2714   // Vectorize the interleaved store group.
2715   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2716   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2717          "masked interleaved groups are not allowed.");
2718   assert((!MaskForGaps || !VF.isScalable()) &&
2719          "masking gaps for scalable vectors is not yet supported.");
2720   for (unsigned Part = 0; Part < UF; Part++) {
2721     // Collect the stored vector from each member.
2722     SmallVector<Value *, 4> StoredVecs;
2723     for (unsigned i = 0; i < InterleaveFactor; i++) {
2724       assert((Group->getMember(i) || MaskForGaps) &&
2725              "Fail to get a member from an interleaved store group");
2726       Instruction *Member = Group->getMember(i);
2727 
2728       // Skip the gaps in the group.
2729       if (!Member) {
2730         Value *Undef = PoisonValue::get(SubVT);
2731         StoredVecs.push_back(Undef);
2732         continue;
2733       }
2734 
2735       Value *StoredVec = State.get(StoredValues[i], Part);
2736 
2737       if (Group->isReverse())
2738         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2739 
2740       // If this member has a different type, cast it to a unified type.
2742       if (StoredVec->getType() != SubVT)
2743         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2744 
2745       StoredVecs.push_back(StoredVec);
2746     }
2747 
2748     // Concatenate all vectors into a wide vector.
2749     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2750 
2751     // Interleave the elements in the wide vector.
2752     Value *IVec = Builder.CreateShuffleVector(
2753         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2754         "interleaved.vec");
2755 
2756     Instruction *NewStoreInstr;
2757     if (BlockInMask || MaskForGaps) {
2758       Value *GroupMask = MaskForGaps;
2759       if (BlockInMask) {
2760         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2761         Value *ShuffledMask = Builder.CreateShuffleVector(
2762             BlockInMaskPart,
2763             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2764             "interleaved.mask");
2765         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2766                                                       ShuffledMask, MaskForGaps)
2767                                 : ShuffledMask;
2768       }
2769       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2770                                                 Group->getAlign(), GroupMask);
2771     } else
2772       NewStoreInstr =
2773           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2774 
2775     Group->addMetadata(NewStoreInstr);
2776   }
2777 }
2778 
2779 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2780                                                VPReplicateRecipe *RepRecipe,
2781                                                const VPIteration &Instance,
2782                                                bool IfPredicateInstr,
2783                                                VPTransformState &State) {
2784   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2785 
2786   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2787   // the first lane and part.
2788   if (isa<NoAliasScopeDeclInst>(Instr))
2789     if (!Instance.isFirstIteration())
2790       return;
2791 
2792   setDebugLocFromInst(Instr);
2793 
2794   // Does this instruction return a value?
2795   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2796 
2797   Instruction *Cloned = Instr->clone();
2798   if (!IsVoidRetTy)
2799     Cloned->setName(Instr->getName() + ".cloned");
2800 
2801   // If the scalarized instruction contributes to the address computation of a
2802   // widened masked load/store which was in a basic block that needed
2803   // predication and is not predicated after vectorization, we can't propagate
2804   // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
2805   // instruction could feed a poison value to the base address of the widened
2806   // load/store.
2807   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2808     Cloned->dropPoisonGeneratingFlags();
2809 
2810   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
2811                                Builder.GetInsertPoint());
2812   // Replace the operands of the cloned instructions with their scalar
2813   // equivalents in the new loop.
2814   for (auto &I : enumerate(RepRecipe->operands())) {
2815     auto InputInstance = Instance;
2816     VPValue *Operand = I.value();
2817     VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
2818     if (OperandR && OperandR->isUniform())
2819       InputInstance.Lane = VPLane::getFirstLane();
2820     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2821   }
2822   addNewMetadata(Cloned, Instr);
2823 
2824   // Place the cloned scalar in the new loop.
2825   Builder.Insert(Cloned);
2826 
2827   State.set(RepRecipe, Cloned, Instance);
2828 
2829   // If we just cloned a new assumption, add it to the assumption cache.
2830   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2831     AC->registerAssumption(II);
2832 
2833   // End if-block.
2834   if (IfPredicateInstr)
2835     PredicatedInstructions.push_back(Cloned);
2836 }
2837 
2838 void InnerLoopVectorizer::createHeaderBranch(Loop *L) {
2839   BasicBlock *Header = L->getHeader();
2840   assert(!L->getLoopLatch() && "loop should not have a latch at this point");
2841 
2842   IRBuilder<> B(Header->getTerminator());
2843   Instruction *OldInst =
2844       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
2845   setDebugLocFromInst(OldInst, &B);
2846 
2847   // Connect the header to the exit and header blocks and replace the old
2848   // terminator.
2849   B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header);
2850 
2851   // Now we have two terminators. Remove the old one from the block.
2852   Header->getTerminator()->eraseFromParent();
2853 }
2854 
2855 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2856   if (TripCount)
2857     return TripCount;
2858 
2859   assert(L && "Create Trip Count for null loop.");
2860   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2861   // Find the loop boundaries.
2862   ScalarEvolution *SE = PSE.getSE();
2863   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2864   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2865          "Invalid loop count");
2866 
2867   Type *IdxTy = Legal->getWidestInductionType();
2868   assert(IdxTy && "No type for induction");
2869 
2870   // The exit count might have type i64 while the phi is i32. This can happen
2871   // if we have an induction variable that is sign-extended before the compare.
2872   // The only way we can get a backedge-taken count in that case is if the
2873   // induction variable was signed and as such will not overflow, so the
2874   // truncation is legal.
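  // For example, an i32 induction that is sign-extended to i64 for the exit
  // compare yields an i64 backedge-taken count that is safe to truncate back
  // to i32 here.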
2875   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2876       IdxTy->getPrimitiveSizeInBits())
2877     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2878   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2879 
2880   // Get the total trip count from the count by adding 1.
2881   const SCEV *ExitCount = SE->getAddExpr(
2882       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2883 
2884   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2885 
2886   // Expand the trip count and place the new instructions in the preheader.
2887   // Notice that the pre-header does not change, only the loop body.
2888   SCEVExpander Exp(*SE, DL, "induction");
2889 
2890   // Count holds the overall loop count (N).
2891   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2892                                 L->getLoopPreheader()->getTerminator());
2893 
2894   if (TripCount->getType()->isPointerTy())
2895     TripCount =
2896         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2897                                     L->getLoopPreheader()->getTerminator());
2898 
2899   return TripCount;
2900 }
2901 
2902 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2903   if (VectorTripCount)
2904     return VectorTripCount;
2905 
2906   Value *TC = getOrCreateTripCount(L);
2907   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2908 
2909   Type *Ty = TC->getType();
2910   // This is where we can make the step a runtime constant.
2911   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2912 
2913   // If the tail is to be folded by masking, round the number of iterations N
2914   // up to a multiple of Step instead of rounding down. This is done by first
2915   // adding Step-1 and then rounding down. Note that it's ok if this addition
2916   // overflows: the vector induction variable will eventually wrap to zero given
2917   // that it starts at zero and its Step is a power of two; the loop will then
2918   // exit, with the last early-exit vector comparison also producing all-true.
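  // For example, with N = 10 and VF * UF = 8, TC becomes 10 + 7 = 17, which
  // the rounding below turns into a vector trip count of 16 (two masked vector
  // iterations).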
2919   if (Cost->foldTailByMasking()) {
2920     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2921            "VF*UF must be a power of 2 when folding tail by masking");
2922     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2923     TC = Builder.CreateAdd(
2924         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2925   }
2926 
2927   // Now we need to generate the expression for the part of the loop that the
2928   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2929   // iterations are not required for correctness, or N - Step, otherwise. Step
2930   // is equal to the vectorization factor (number of SIMD elements) times the
2931   // unroll factor (number of SIMD instructions).
2932   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2933 
2934   // There are cases where we *must* run at least one iteration in the remainder
2935   // loop.  See the cost model for when this can happen.  If the step evenly
2936   // divides the trip count, we set the remainder to be equal to the step. If
2937   // the step does not evenly divide the trip count, no adjustment is necessary
2938   // since there will already be scalar iterations. Note that the minimum
2939   // iterations check ensures that N >= Step.
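  // For example, with N = 16 and Step = 8, R is 0; the select below forces it
  // to 8, the vector trip count becomes 8, and the epilogue executes the
  // remaining 8 iterations.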
2940   if (Cost->requiresScalarEpilogue(VF)) {
2941     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2942     R = Builder.CreateSelect(IsZero, Step, R);
2943   }
2944 
2945   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2946 
2947   return VectorTripCount;
2948 }
2949 
2950 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2951                                                    const DataLayout &DL) {
2952   // Verify that V is a vector with the same number of elements as DstVTy.
2953   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2954   unsigned VF = DstFVTy->getNumElements();
2955   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
2956   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2957   Type *SrcElemTy = SrcVecTy->getElementType();
2958   Type *DstElemTy = DstFVTy->getElementType();
2959   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2960          "Vector elements must have same size");
2961 
2962   // Do a direct cast if element types are castable.
2963   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2964     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2965   }
2966   // V cannot be directly cast to the desired vector type.
2967   // This may happen when V is a floating point vector but DstVTy is a vector
2968   // of pointers, or vice versa. Handle this with a two-step bitcast through an
2969   // intermediate integer type, i.e. Ptr <-> Int <-> Float.
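  // For example (illustration assuming 64-bit pointers), casting <4 x double>
  // to <4 x i8*> goes through <4 x i64>:
  //   <4 x double> -> <4 x i64> -> <4 x i8*>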
2970   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2971          "Only one type should be a pointer type");
2972   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2973          "Only one type should be a floating point type");
2974   Type *IntTy =
2975       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2976   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2977   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2978   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2979 }
2980 
2981 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2982                                                          BasicBlock *Bypass) {
2983   Value *Count = getOrCreateTripCount(L);
2984   // Reuse existing vector loop preheader for TC checks.
2985   // Note that a new preheader block is generated for the vector loop.
2986   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2987   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2988 
2989   // Generate code to check if the loop's trip count is less than VF * UF, or
2990   // equal to it in case a scalar epilogue is required; this implies that the
2991   // vector trip count is zero. This check also covers the case where adding
2992   // one to the backedge-taken count overflowed, leading to an incorrect trip
2993   // count of zero. In this case we will also jump to the scalar loop.
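  // For example, with VF * UF = 8, a trip count of 5 (or of exactly 8 when a
  // scalar epilogue is required) takes the bypass to the scalar loop.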
2994   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2995                                             : ICmpInst::ICMP_ULT;
2996 
2997   // If tail is to be folded, vector loop takes care of all iterations.
2998   Value *CheckMinIters = Builder.getFalse();
2999   if (!Cost->foldTailByMasking()) {
3000     Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
3001     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3002   }
3003   // Create new preheader for vector loop.
3004   LoopVectorPreHeader =
3005       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3006                  "vector.ph");
3007 
3008   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3009                                DT->getNode(Bypass)->getIDom()) &&
3010          "TC check is expected to dominate Bypass");
3011 
3012   // Update dominator for Bypass & LoopExit (if needed).
3013   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3014   if (!Cost->requiresScalarEpilogue(VF))
3015     // If there is an epilogue which must run, there's no edge from the
3016     // middle block to exit blocks and thus no need to update the immediate
3017     // dominator of the exit blocks.
3018     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3019 
3020   ReplaceInstWithInst(
3021       TCCheckBlock->getTerminator(),
3022       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3023   LoopBypassBlocks.push_back(TCCheckBlock);
3024 }
3025 
3026 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3027 
3028   BasicBlock *const SCEVCheckBlock =
3029       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3030   if (!SCEVCheckBlock)
3031     return nullptr;
3032 
3033   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3034            (OptForSizeBasedOnProfile &&
3035             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3036          "Cannot SCEV check stride or overflow when optimizing for size");
3037
3039   // Update dominator only if this is first RT check.
3040   if (LoopBypassBlocks.empty()) {
3041     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3042     if (!Cost->requiresScalarEpilogue(VF))
3043       // If there is an epilogue which must run, there's no edge from the
3044       // middle block to exit blocks and thus no need to update the immediate
3045       // dominator of the exit blocks.
3046       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3047   }
3048 
3049   LoopBypassBlocks.push_back(SCEVCheckBlock);
3050   AddedSafetyChecks = true;
3051   return SCEVCheckBlock;
3052 }
3053 
3054 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3055                                                       BasicBlock *Bypass) {
3056   // VPlan-native path does not do any analysis for runtime checks currently.
3057   if (EnableVPlanNativePath)
3058     return nullptr;
3059 
3060   BasicBlock *const MemCheckBlock =
3061       RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3062 
3063   // Check if we generated code that checks at runtime whether arrays overlap.
3064   // We put the checks into a separate block to make the more common case of
3065   // few elements faster.
3066   if (!MemCheckBlock)
3067     return nullptr;
3068 
3069   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3070     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3071            "Cannot emit memory checks when optimizing for size, unless forced "
3072            "to vectorize.");
3073     ORE->emit([&]() {
3074       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3075                                         L->getStartLoc(), L->getHeader())
3076              << "Code-size may be reduced by not forcing "
3077                 "vectorization, or by source-code modifications "
3078                 "eliminating the need for runtime checks "
3079                 "(e.g., adding 'restrict').";
3080     });
3081   }
3082 
3083   LoopBypassBlocks.push_back(MemCheckBlock);
3084 
3085   AddedSafetyChecks = true;
3086 
3087   // We currently don't use LoopVersioning for the actual loop cloning but we
3088   // still use it to add the noalias metadata.
3089   LVer = std::make_unique<LoopVersioning>(
3090       *Legal->getLAI(),
3091       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3092       DT, PSE.getSE());
3093   LVer->prepareNoAliasMetadata();
3094   return MemCheckBlock;
3095 }
3096 
3097 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3098   LoopScalarBody = OrigLoop->getHeader();
3099   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3100   assert(LoopVectorPreHeader && "Invalid loop structure");
3101   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3102   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3103          "multiple exit loop without required epilogue?");
3104 
3105   LoopMiddleBlock =
3106       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3107                  LI, nullptr, Twine(Prefix) + "middle.block");
3108   LoopScalarPreHeader =
3109       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3110                  nullptr, Twine(Prefix) + "scalar.ph");
3111 
3112   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3113 
3114   // Set up the middle block terminator.  Two cases:
3115   // 1) If we know that we must execute the scalar epilogue, emit an
3116   //    unconditional branch.
3117   // 2) Otherwise, we must have a single unique exit block (due to how we
3118   //    implement the multiple exit case).  In this case, set up a conditional
3119   //    branch from the middle block to the loop scalar preheader, and the
3120   //    exit block.  completeLoopSkeleton will update the condition to use an
3121   //    iteration check, if required to decide whether to execute the remainder.
3122   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3123     BranchInst::Create(LoopScalarPreHeader) :
3124     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3125                        Builder.getTrue());
3126   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3127   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3128 
3129   // We intentionally don't let SplitBlock update LoopInfo since
3130   // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
3131   // LoopVectorBody is explicitly added to the correct place a few lines later.
3132   BasicBlock *LoopVectorBody =
3133       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3134                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3135 
3136   // Update dominator for loop exit.
3137   if (!Cost->requiresScalarEpilogue(VF))
3138     // If there is an epilogue which must run, there's no edge from the
3139     // middle block to exit blocks and thus no need to update the immediate
3140     // dominator of the exit blocks.
3141     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3142 
3143   // Create and register the new vector loop.
3144   Loop *Lp = LI->AllocateLoop();
3145   Loop *ParentLoop = OrigLoop->getParentLoop();
3146 
3147   // Insert the new loop into the loop nest and register the new basic blocks
3148   // before calling any utilities such as SCEV that require valid LoopInfo.
3149   if (ParentLoop) {
3150     ParentLoop->addChildLoop(Lp);
3151   } else {
3152     LI->addTopLevelLoop(Lp);
3153   }
3154   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3155   return Lp;
3156 }
3157 
3158 void InnerLoopVectorizer::createInductionResumeValues(
3159     Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) {
3160   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3161           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3162          "Inconsistent information about additional bypass.");
3163 
3164   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3165   assert(VectorTripCount && L && "Expected valid arguments");
3166   // We are going to resume the execution of the scalar loop.
3167   // Go over all of the induction variables that we found and fix the
3168   // PHIs that are left in the scalar version of the loop.
3169   // The starting values of PHI nodes depend on the counter of the last
3170   // iteration in the vectorized loop.
3171   // If we come from a bypass edge then we need to start from the original
3172   // start value.
3173   Instruction *OldInduction = Legal->getPrimaryInduction();
3174   for (auto &InductionEntry : Legal->getInductionVars()) {
3175     PHINode *OrigPhi = InductionEntry.first;
3176     InductionDescriptor II = InductionEntry.second;
3177 
3178     // Create phi nodes to merge from the backedge-taken check block.
3179     PHINode *BCResumeVal =
3180         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3181                         LoopScalarPreHeader->getTerminator());
3182     // Copy original phi DL over to the new one.
3183     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3184     Value *&EndValue = IVEndValues[OrigPhi];
3185     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3186     if (OrigPhi == OldInduction) {
3187       // We know what the end value is.
3188       EndValue = VectorTripCount;
3189     } else {
3190       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3191 
3192       // Fast-math-flags propagate from the original induction instruction.
3193       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3194         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3195 
3196       Type *StepType = II.getStep()->getType();
3197       Instruction::CastOps CastOp =
3198           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3199       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3200       Value *Step =
3201           CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3202       EndValue = emitTransformedIndex(B, CRD, II.getStartValue(), Step, II);
3203       EndValue->setName("ind.end");
3204 
3205       // Compute the end value for the additional bypass (if applicable).
3206       if (AdditionalBypass.first) {
3207         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3208         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3209                                          StepType, true);
3210         Value *Step =
3211             CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3212         CRD =
3213             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3214         EndValueFromAdditionalBypass =
3215             emitTransformedIndex(B, CRD, II.getStartValue(), Step, II);
3216         EndValueFromAdditionalBypass->setName("ind.end");
3217       }
3218     }
3219     // The new PHI merges the original incoming value, in case of a bypass,
3220     // or the value at the end of the vectorized loop.
3221     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3222 
3223     // Fix the scalar body counter (PHI node).
3224     // The old induction's phi node in the scalar body needs the truncated
3225     // value.
3226     for (BasicBlock *BB : LoopBypassBlocks)
3227       BCResumeVal->addIncoming(II.getStartValue(), BB);
3228 
3229     if (AdditionalBypass.first)
3230       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3231                                             EndValueFromAdditionalBypass);
3232 
3233     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3234   }
3235 }
3236 
3237 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3238                                                       MDNode *OrigLoopID) {
3239   assert(L && "Expected valid loop.");
3240 
3241   // The trip counts should be cached by now.
3242   Value *Count = getOrCreateTripCount(L);
3243   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3244 
3245   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3246 
3247   // Add a check in the middle block to see if we have completed
3248   // all of the iterations in the first vector loop.  Three cases:
3249   // 1) If we require a scalar epilogue, there is no conditional branch as
3250   //    we unconditionally branch to the scalar preheader.  Do nothing.
3251   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3252   //    Thus if tail is to be folded, we know we don't need to run the
3253   //    remainder and we can use the previous value for the condition (true).
3254   // 3) Otherwise, construct a runtime check.
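  // For example, with N = 20 and VF * UF = 8, the vector trip count is 16;
  // 20 != 16, so the branch falls through to the scalar preheader to run the
  // remaining 4 iterations.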
3255   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3256     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3257                                         Count, VectorTripCount, "cmp.n",
3258                                         LoopMiddleBlock->getTerminator());
3259 
3260     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3261     // of the corresponding compare because they may have ended up with
3262     // different line numbers and we want to avoid awkward line stepping while
3263     // debugging. E.g. if the compare has a line number inside the loop.
3264     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3265     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3266   }
3267 
3268   // Get ready to start creating new instructions into the vectorized body.
3269   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3270          "Inconsistent vector loop preheader");
3271 
3272 #ifdef EXPENSIVE_CHECKS
3273   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3274   LI->verify(*DT);
3275 #endif
3276 
3277   return LoopVectorPreHeader;
3278 }
3279 
3280 std::pair<BasicBlock *, Value *>
3281 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3282   /*
3283    In this function we generate a new loop. The new loop will contain
3284    the vectorized instructions while the old loop will continue to run the
3285    scalar remainder.
3286 
3287        [ ] <-- loop iteration number check.
3288     /   |
3289    /    v
3290   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3291   |  /  |
3292   | /   v
3293   ||   [ ]     <-- vector pre header.
3294   |/    |
3295   |     v
3296   |    [  ] \
3297   |    [  ]_|   <-- vector loop.
3298   |     |
3299   |     v
3300   \   -[ ]   <--- middle-block.
3301    \/   |
3302    /\   v
3303    | ->[ ]     <--- new preheader.
3304    |    |
3305  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3306    |   [ ] \
3307    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3308     \   |
3309      \  v
3310       >[ ]     <-- exit block(s).
3311    ...
3312    */
3313 
3314   // Get the metadata of the original loop before it gets modified.
3315   MDNode *OrigLoopID = OrigLoop->getLoopID();
3316 
3317   // Workaround!  Compute the trip count of the original loop and cache it
3318   // before we start modifying the CFG.  This code has a systemic problem
3319   // wherein it tries to run analysis over partially constructed IR; this is
3320   // wrong, and not simply for SCEV.  The trip count of the original loop
3321   // simply happens to be prone to hitting this in practice.  In theory, we
3322   // can hit the same issue for any SCEV, or ValueTracking query done during
3323   // mutation.  See PR49900.
3324   getOrCreateTripCount(OrigLoop);
3325 
3326   // Create an empty vector loop, and prepare basic blocks for the runtime
3327   // checks.
3328   Loop *Lp = createVectorLoopSkeleton("");
3329 
3330   // Now, compare the new count to zero. If it is zero skip the vector loop and
3331   // jump to the scalar loop. This check also covers the case where the
3332   // backedge-taken count is uint##_max: adding one to it will overflow leading
3333   // to an incorrect trip count of zero. In this (rare) case we will also jump
3334   // to the scalar loop.
3335   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3336 
3337   // Generate the code to check any assumptions that we've made for SCEV
3338   // expressions.
3339   emitSCEVChecks(LoopScalarPreHeader);
3340 
3341   // Generate the code that checks at runtime whether arrays overlap. We put
3342   // the checks into a separate block to make the more common case of few
3343   // elements faster.
3344   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3345 
3346   createHeaderBranch(Lp);
3347 
3348   // Emit phis for the new starting index of the scalar loop.
3349   createInductionResumeValues(Lp);
3350 
3351   return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
3352 }
3353 
3354 // Fix up external users of the induction variable. At this point, we are
3355 // in LCSSA form, with all external PHIs that use the IV having one input value,
3356 // coming from the remainder loop. We need those PHIs to also have a correct
3357 // value for the IV when arriving directly from the middle block.
3358 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3359                                        const InductionDescriptor &II,
3360                                        Value *CountRoundDown, Value *EndValue,
3361                                        BasicBlock *MiddleBlock,
3362                                        BasicBlock *VectorHeader) {
3363   // There are two kinds of external IV usages - those that use the value
3364   // computed in the last iteration (the value that feeds into the phi from
3365   // the loop latch) and those that use the penultimate value (the PHI
3366   // itself). We allow both, but they obviously have different values.
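  //
  // For example, in LCSSA form (illustrative names):
  //   loop:
  //     %iv      = phi i64 [ 0, %preheader ], [ %iv.next, %latch ]
  //     %iv.next = add i64 %iv, 1
  //     ...
  //   exit:
  //     %use.last = phi i64 [ %iv.next, %latch ]  ; receives EndValue
  //     %use.pen  = phi i64 [ %iv, %latch ]       ; receives EndValue - Step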
3367 
3368   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3369 
3370   DenseMap<Value *, Value *> MissingVals;
3371 
3372   // An external user of the last iteration's value should see the value that
3373   // the remainder loop uses to initialize its own IV.
3374   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3375   for (User *U : PostInc->users()) {
3376     Instruction *UI = cast<Instruction>(U);
3377     if (!OrigLoop->contains(UI)) {
3378       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3379       MissingVals[UI] = EndValue;
3380     }
3381   }
3382 
3383   // An external user of the penultimate value needs to see EndValue - Step.
3384   // The simplest way to get this is to recompute it from the constituent SCEVs,
3385   // that is Start + (Step * (CRD - 1)).
3386   for (User *U : OrigPhi->users()) {
3387     auto *UI = cast<Instruction>(U);
3388     if (!OrigLoop->contains(UI)) {
3389       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3390 
3391       IRBuilder<> B(MiddleBlock->getTerminator());
3392 
3393       // Fast-math-flags propagate from the original induction instruction.
3394       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3395         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3396 
3397       Value *CountMinusOne = B.CreateSub(
3398           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3399       Value *CMO =
3400           !II.getStep()->getType()->isIntegerTy()
3401               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3402                              II.getStep()->getType())
3403               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3404       CMO->setName("cast.cmo");
3405 
3406       Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3407                                     VectorHeader->getTerminator());
3408       Value *Escape =
3409           emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
3410       Escape->setName("ind.escape");
3411       MissingVals[UI] = Escape;
3412     }
3413   }
3414 
3415   for (auto &I : MissingVals) {
3416     PHINode *PHI = cast<PHINode>(I.first);
3417     // One corner case we have to handle is two IVs "chasing" each other,
3418     // that is %IV2 = phi [...], [ %IV1, %latch ]
3419     // In this case, if IV1 has an external use, we need to avoid adding both
3420     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3421     // don't already have an incoming value for the middle block.
3422     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3423       PHI->addIncoming(I.second, MiddleBlock);
3424   }
3425 }
3426 
3427 namespace {
3428 
3429 struct CSEDenseMapInfo {
3430   static bool canHandle(const Instruction *I) {
3431     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3432            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3433   }
3434 
3435   static inline Instruction *getEmptyKey() {
3436     return DenseMapInfo<Instruction *>::getEmptyKey();
3437   }
3438 
3439   static inline Instruction *getTombstoneKey() {
3440     return DenseMapInfo<Instruction *>::getTombstoneKey();
3441   }
3442 
3443   static unsigned getHashValue(const Instruction *I) {
3444     assert(canHandle(I) && "Unknown instruction!");
3445     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3446                                                            I->value_op_end()));
3447   }
3448 
3449   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3450     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3451         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3452       return LHS == RHS;
3453     return LHS->isIdenticalTo(RHS);
3454   }
3455 };
3456 
3457 } // end anonymous namespace
3458 
3459 /// Perform CSE of induction variable instructions.
3460 static void cse(BasicBlock *BB) {
3461   // Perform simple CSE.
3462   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3463   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3464     if (!CSEDenseMapInfo::canHandle(&In))
3465       continue;
3466 
3467     // Check if we can replace this instruction with any of the
3468     // visited instructions.
3469     if (Instruction *V = CSEMap.lookup(&In)) {
3470       In.replaceAllUsesWith(V);
3471       In.eraseFromParent();
3472       continue;
3473     }
3474 
3475     CSEMap[&In] = &In;
3476   }
3477 }
3478 
3479 InstructionCost
3480 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3481                                               bool &NeedToScalarize) const {
3482   Function *F = CI->getCalledFunction();
3483   Type *ScalarRetTy = CI->getType();
3484   SmallVector<Type *, 4> Tys, ScalarTys;
3485   for (auto &ArgOp : CI->args())
3486     ScalarTys.push_back(ArgOp->getType());
3487 
3488   // Estimate cost of scalarized vector call. The source operands are assumed
3489   // to be vectors, so we need to extract individual elements from there,
3490   // execute VF scalar calls, and then gather the result into the vector return
3491   // value.
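  // For example, with VF = 4 this amounts to 4 * ScalarCallCost plus the
  // extract/insert (scalarization) overhead computed below.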
3492   InstructionCost ScalarCallCost =
3493       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3494   if (VF.isScalar())
3495     return ScalarCallCost;
3496 
3497   // Compute corresponding vector type for return value and arguments.
3498   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3499   for (Type *ScalarTy : ScalarTys)
3500     Tys.push_back(ToVectorTy(ScalarTy, VF));
3501 
3502   // Compute costs of unpacking argument values for the scalar calls and
3503   // packing the return values to a vector.
3504   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3505 
3506   InstructionCost Cost =
3507       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3508 
3509   // If we can't emit a vector call for this function, then the currently found
3510   // cost is the cost we need to return.
3511   NeedToScalarize = true;
3512   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3513   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3514 
3515   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3516     return Cost;
3517 
3518   // If the corresponding vector cost is cheaper, return its cost.
3519   InstructionCost VectorCallCost =
3520       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3521   if (VectorCallCost < Cost) {
3522     NeedToScalarize = false;
3523     Cost = VectorCallCost;
3524   }
3525   return Cost;
3526 }
3527 
3528 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3529   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3530     return Elt;
3531   return VectorType::get(Elt, VF);
3532 }
3533 
3534 InstructionCost
3535 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3536                                                    ElementCount VF) const {
3537   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3538   assert(ID && "Expected intrinsic call!");
3539   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3540   FastMathFlags FMF;
3541   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3542     FMF = FPMO->getFastMathFlags();
3543 
3544   SmallVector<const Value *> Arguments(CI->args());
3545   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3546   SmallVector<Type *> ParamTys;
3547   std::transform(FTy->param_begin(), FTy->param_end(),
3548                  std::back_inserter(ParamTys),
3549                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3550 
3551   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3552                                     dyn_cast<IntrinsicInst>(CI));
3553   return TTI.getIntrinsicInstrCost(CostAttrs,
3554                                    TargetTransformInfo::TCK_RecipThroughput);
3555 }
3556 
3557 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3558   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3559   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3560   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3561 }
3562 
3563 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3564   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3565   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3566   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3567 }
3568 
3569 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3570   // For every instruction `I` in MinBWs, truncate the operands, create a
3571   // truncated version of `I` and reextend its result. InstCombine runs
3572   // later and will remove any ext/trunc pairs.
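  // For example, if MinBWs records that `I` only needs 8 bits, then
  //   %I = add <4 x i32> %x, %y
  // is (illustratively) rewritten as
  //   %t = add <4 x i8> (trunc %x), (trunc %y)
  //   %e = zext <4 x i8> %t to <4 x i32>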
3573   SmallPtrSet<Value *, 4> Erased;
3574   for (const auto &KV : Cost->getMinimalBitwidths()) {
3575     // If the value wasn't vectorized, we must maintain the original scalar
3576     // type. The absence of the value from State indicates that it
3577     // wasn't vectorized.
3578     // FIXME: Should not rely on getVPValue at this point.
3579     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3580     if (!State.hasAnyVectorValue(Def))
3581       continue;
3582     for (unsigned Part = 0; Part < UF; ++Part) {
3583       Value *I = State.get(Def, Part);
3584       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3585         continue;
3586       Type *OriginalTy = I->getType();
3587       Type *ScalarTruncatedTy =
3588           IntegerType::get(OriginalTy->getContext(), KV.second);
3589       auto *TruncatedTy = VectorType::get(
3590           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3591       if (TruncatedTy == OriginalTy)
3592         continue;
3593 
3594       IRBuilder<> B(cast<Instruction>(I));
3595       auto ShrinkOperand = [&](Value *V) -> Value * {
3596         if (auto *ZI = dyn_cast<ZExtInst>(V))
3597           if (ZI->getSrcTy() == TruncatedTy)
3598             return ZI->getOperand(0);
3599         return B.CreateZExtOrTrunc(V, TruncatedTy);
3600       };
3601 
3602       // The actual instruction modification depends on the instruction type,
3603       // unfortunately.
3604       Value *NewI = nullptr;
3605       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3606         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3607                              ShrinkOperand(BO->getOperand(1)));
3608 
3609         // Any wrapping introduced by shrinking this operation shouldn't be
3610         // considered undefined behavior. So, we can't unconditionally copy
3611         // arithmetic wrapping flags to NewI.
3612         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3613       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3614         NewI =
3615             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3616                          ShrinkOperand(CI->getOperand(1)));
3617       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3618         NewI = B.CreateSelect(SI->getCondition(),
3619                               ShrinkOperand(SI->getTrueValue()),
3620                               ShrinkOperand(SI->getFalseValue()));
3621       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3622         switch (CI->getOpcode()) {
3623         default:
3624           llvm_unreachable("Unhandled cast!");
3625         case Instruction::Trunc:
3626           NewI = ShrinkOperand(CI->getOperand(0));
3627           break;
3628         case Instruction::SExt:
3629           NewI = B.CreateSExtOrTrunc(
3630               CI->getOperand(0),
3631               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3632           break;
3633         case Instruction::ZExt:
3634           NewI = B.CreateZExtOrTrunc(
3635               CI->getOperand(0),
3636               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3637           break;
3638         }
3639       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3640         auto Elements0 =
3641             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3642         auto *O0 = B.CreateZExtOrTrunc(
3643             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3644         auto Elements1 =
3645             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3646         auto *O1 = B.CreateZExtOrTrunc(
3647             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3648 
3649         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3650       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3651         // Don't do anything with the operands, just extend the result.
3652         continue;
3653       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3654         auto Elements =
3655             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3656         auto *O0 = B.CreateZExtOrTrunc(
3657             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3658         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3659         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3660       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3661         auto Elements =
3662             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3663         auto *O0 = B.CreateZExtOrTrunc(
3664             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3665         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3666       } else {
3667         // If we don't know what to do, be conservative and don't do anything.
3668         continue;
3669       }
3670 
3671       // Lastly, extend the result.
3672       NewI->takeName(cast<Instruction>(I));
3673       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3674       I->replaceAllUsesWith(Res);
3675       cast<Instruction>(I)->eraseFromParent();
3676       Erased.insert(I);
3677       State.reset(Def, Res, Part);
3678     }
3679   }
3680 
  // We'll have created a bunch of ZExts that may now be dead. Clean them up.
3682   for (const auto &KV : Cost->getMinimalBitwidths()) {
3683     // If the value wasn't vectorized, we must maintain the original scalar
3684     // type. The absence of the value from State indicates that it
3685     // wasn't vectorized.
3686     // FIXME: Should not rely on getVPValue at this point.
3687     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3688     if (!State.hasAnyVectorValue(Def))
3689       continue;
3690     for (unsigned Part = 0; Part < UF; ++Part) {
3691       Value *I = State.get(Def, Part);
3692       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3693       if (Inst && Inst->use_empty()) {
3694         Value *NewI = Inst->getOperand(0);
3695         Inst->eraseFromParent();
3696         State.reset(Def, NewI, Part);
3697       }
3698     }
3699   }
3700 }
3701 
3702 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
3703   // Insert truncates and extends for any truncated instructions as hints to
3704   // InstCombine.
3705   if (VF.isVector())
3706     truncateToMinimalBitwidths(State);
3707 
3708   // Fix widened non-induction PHIs by setting up the PHI operands.
3709   if (OrigPHIsToFix.size()) {
3710     assert(EnableVPlanNativePath &&
3711            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3712     fixNonInductionPHIs(State);
3713   }
3714 
3715   // At this point every instruction in the original loop is widened to a
3716   // vector form. Now we need to fix the recurrences in the loop. These PHI
3717   // nodes are currently empty because we did not want to introduce cycles.
3718   // This is the second stage of vectorizing recurrences.
3719   fixCrossIterationPHIs(State);
3720 
3721   // Forget the original basic block.
3722   PSE.getSE()->forgetLoop(OrigLoop);
3723 
3724   Loop *VectorLoop = LI->getLoopFor(State.CFG.PrevBB);
3725   // If we inserted an edge from the middle block to the unique exit block,
3726   // update uses outside the loop (phis) to account for the newly inserted
3727   // edge.
3728   if (!Cost->requiresScalarEpilogue(VF)) {
3729     // Fix-up external users of the induction variables.
3730     for (auto &Entry : Legal->getInductionVars())
3731       fixupIVUsers(
3732           Entry.first, Entry.second, getOrCreateVectorTripCount(VectorLoop),
3733           IVEndValues[Entry.first], LoopMiddleBlock, VectorLoop->getHeader());
3734 
3735     fixLCSSAPHIs(State);
3736   }
3737 
3738   for (Instruction *PI : PredicatedInstructions)
3739     sinkScalarOperands(&*PI);
3740 
3741   // Remove redundant induction instructions.
3742   cse(VectorLoop->getHeader());
3743 
3744   // Set/update profile weights for the vector and remainder loops as original
3745   // loop iterations are now distributed among them. Note that original loop
3746   // represented by LoopScalarBody becomes remainder loop after vectorization.
3747   //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by the runtime legality checks is
  // ignored; all the weight is optimistically assigned to the vector loop.
3753   //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
3757   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3758                                LI->getLoopFor(LoopScalarBody),
3759                                VF.getKnownMinValue() * UF);
3760 }
3761 
3762 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3763   // In order to support recurrences we need to be able to vectorize Phi nodes.
3764   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3765   // stage #2: We now need to fix the recurrences by adding incoming edges to
3766   // the currently empty PHI nodes. At this point every instruction in the
3767   // original loop is widened to a vector form so we can use them to construct
3768   // the incoming edges.
3769   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
3770   for (VPRecipeBase &R : Header->phis()) {
3771     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3772       fixReduction(ReductionPhi, State);
3773     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3774       fixFirstOrderRecurrence(FOR, State);
3775   }
3776 }
3777 
3778 void InnerLoopVectorizer::fixFirstOrderRecurrence(
3779     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3780   // This is the second phase of vectorizing first-order recurrences. An
3781   // overview of the transformation is described below. Suppose we have the
3782   // following loop.
3783   //
3784   //   for (int i = 0; i < n; ++i)
3785   //     b[i] = a[i] - a[i - 1];
3786   //
3787   // There is a first-order recurrence on "a". For this loop, the shorthand
3788   // scalar IR looks like:
3789   //
3790   //   scalar.ph:
3791   //     s_init = a[-1]
3792   //     br scalar.body
3793   //
3794   //   scalar.body:
3795   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3796   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3797   //     s2 = a[i]
3798   //     b[i] = s2 - s1
3799   //     br cond, scalar.body, ...
3800   //
  // In this example, s1 is a recurrence because its value depends on the
3802   // previous iteration. In the first phase of vectorization, we created a
3803   // vector phi v1 for s1. We now complete the vectorization and produce the
3804   // shorthand vector IR shown below (for VF = 4, UF = 1).
3805   //
3806   //   vector.ph:
3807   //     v_init = vector(..., ..., ..., a[-1])
3808   //     br vector.body
3809   //
3810   //   vector.body
3811   //     i = phi [0, vector.ph], [i+4, vector.body]
3812   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3813   //     v2 = a[i, i+1, i+2, i+3];
3814   //     v3 = vector(v1(3), v2(0, 1, 2))
3815   //     b[i, i+1, i+2, i+3] = v2 - v3
3816   //     br cond, vector.body, middle.block
3817   //
3818   //   middle.block:
3819   //     x = v2(3)
3820   //     br scalar.ph
3821   //
3822   //   scalar.ph:
3823   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3824   //     br scalar.body
3825   //
3826   // After execution completes the vector loop, we extract the next value of
3827   // the recurrence (x) to use as the initial value in the scalar loop.
3828 
3829   // Extract the last vector element in the middle block. This will be the
3830   // initial value for the recurrence when jumping to the scalar loop.
3831   VPValue *PreviousDef = PhiR->getBackedgeValue();
3832   Value *Incoming = State.get(PreviousDef, UF - 1);
3833   auto *ExtractForScalar = Incoming;
3834   auto *IdxTy = Builder.getInt32Ty();
3835   if (VF.isVector()) {
3836     auto *One = ConstantInt::get(IdxTy, 1);
3837     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3838     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3839     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3840     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3841                                                     "vector.recur.extract");
3842   }
3843   // Extract the second last element in the middle block if the
3844   // Phi is used outside the loop. We need to extract the phi itself
3845   // and not the last element (the phi update in the current iteration). This
3846   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3847   // when the scalar loop is not run at all.
3848   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3849   if (VF.isVector()) {
3850     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3851     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3852     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3853         Incoming, Idx, "vector.recur.extract.for.phi");
3854   } else if (UF > 1)
3855     // When loop is unrolled without vectorizing, initialize
3856     // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value
3857     // of `Incoming`. This is analogous to the vectorized case above: extracting
3858     // the second last element when VF > 1.
3859     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3860 
3861   // Fix the initial value of the original recurrence in the scalar loop.
3862   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3863   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3864   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3865   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3866   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3867     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3868     Start->addIncoming(Incoming, BB);
3869   }
3870 
3871   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3872   Phi->setName("scalar.recur");
3873 
3874   // Finally, fix users of the recurrence outside the loop. The users will need
3875   // either the last value of the scalar recurrence or the last value of the
3876   // vector recurrence we extracted in the middle block. Since the loop is in
3877   // LCSSA form, we just need to find all the phi nodes for the original scalar
3878   // recurrence in the exit block, and then add an edge for the middle block.
3879   // Note that LCSSA does not imply single entry when the original scalar loop
3880   // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from the middle block to
  // the exit block, and thus no phis that need to be updated.
3883   if (!Cost->requiresScalarEpilogue(VF))
3884     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3885       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
3886         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3887 }
3888 
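// Fix the vectorized reduction described by PhiR: when the tail is folded by
// masking, use the mask select as the value leaving the loop; reduce the
// unrolled parts to a single value in the middle block; and wire the result
// into the resume phi, the scalar loop phi and any LCSSA phis in the exit
// block.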
3889 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3890                                        VPTransformState &State) {
3891   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
3893   assert(Legal->isReductionVariable(OrigPhi) &&
3894          "Unable to find the reduction variable");
3895   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3896 
3897   RecurKind RK = RdxDesc.getRecurrenceKind();
3898   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3899   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3900   setDebugLocFromInst(ReductionStartValue);
3901 
3902   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3903   // This is the vector-clone of the value that leaves the loop.
3904   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3905 
3906   // Wrap flags are in general invalid after vectorization, clear them.
3907   clearReductionWrapFlags(RdxDesc, State);
3908 
3909   // Before each round, move the insertion point right between
3910   // the PHIs and the values we are going to write.
3911   // This allows us to write both PHINodes and the extractelement
3912   // instructions.
3913   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3914 
3915   setDebugLocFromInst(LoopExitInst);
3916 
3917   Type *PhiTy = OrigPhi->getType();
3918   BasicBlock *VectorLoopLatch =
3919       LI->getLoopFor(State.CFG.PrevBB)->getLoopLatch();
3920   // If tail is folded by masking, the vector value to leave the loop should be
3921   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3922   // instead of the former. For an inloop reduction the reduction will already
3923   // be predicated, and does not need to be handled here.
3924   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3925     for (unsigned Part = 0; Part < UF; ++Part) {
3926       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3927       Value *Sel = nullptr;
3928       for (User *U : VecLoopExitInst->users()) {
3929         if (isa<SelectInst>(U)) {
3930           assert(!Sel && "Reduction exit feeding two selects");
3931           Sel = U;
3932         } else
3933           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3934       }
3935       assert(Sel && "Reduction exit feeds no select");
3936       State.reset(LoopExitInstDef, Sel, Part);
3937 
3938       // If the target can create a predicated operator for the reduction at no
3939       // extra cost in the loop (for example a predicated vadd), it can be
3940       // cheaper for the select to remain in the loop than be sunk out of it,
3941       // and so use the select value for the phi instead of the old
3942       // LoopExitValue.
3943       if (PreferPredicatedReductionSelect ||
3944           TTI->preferPredicatedReductionSelect(
3945               RdxDesc.getOpcode(), PhiTy,
3946               TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi = cast<PHINode>(State.get(PhiR, Part));
3949         VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3950       }
3951     }
3952   }
3953 
3954   // If the vector reduction can be performed in a smaller type, we truncate
3955   // then extend the loop exit value to enable InstCombine to evaluate the
3956   // entire expression in the smaller type.
3957   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3958     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3959     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3960     Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3961     VectorParts RdxParts(UF);
3962     for (unsigned Part = 0; Part < UF; ++Part) {
3963       RdxParts[Part] = State.get(LoopExitInstDef, Part);
3964       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3965       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3966                                         : Builder.CreateZExt(Trunc, VecTy);
3967       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3968         if (U != Trunc) {
3969           U->replaceUsesOfWith(RdxParts[Part], Extnd);
3970           RdxParts[Part] = Extnd;
3971         }
3972     }
3973     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3974     for (unsigned Part = 0; Part < UF; ++Part) {
3975       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3976       State.reset(LoopExitInstDef, RdxParts[Part], Part);
3977     }
3978   }
3979 
3980   // Reduce all of the unrolled parts into a single vector.
3981   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3982   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3983 
3984   // The middle block terminator has already been assigned a DebugLoc here (the
3985   // OrigLoop's single latch terminator). We want the whole middle block to
3986   // appear to execute on this line because: (a) it is all compiler generated,
3987   // (b) these instructions are always executed after evaluating the latch
3988   // conditional branch, and (c) other passes may add new predecessors which
3989   // terminate on this line. This is the easiest way to ensure we don't
3990   // accidentally cause an extra step back into the loop while debugging.
3991   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3992   if (PhiR->isOrdered())
3993     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3994   else {
3995     // Floating-point operations should have some FMF to enable the reduction.
3996     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3997     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3998     for (unsigned Part = 1; Part < UF; ++Part) {
3999       Value *RdxPart = State.get(LoopExitInstDef, Part);
4000       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4001         ReducedPartRdx = Builder.CreateBinOp(
4002             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4003       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
4004         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
4005                                            ReducedPartRdx, RdxPart);
4006       else
4007         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4008     }
4009   }
4010 
  // Create the reduction after the loop. Note that inloop reductions create
  // the target reduction in the loop using a Reduction recipe.
4013   if (VF.isVector() && !PhiR->isInLoop()) {
4014     ReducedPartRdx =
4015         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4016     // If the reduction can be performed in a smaller type, we need to extend
4017     // the reduction to the wider type before we branch to the original loop.
4018     if (PhiTy != RdxDesc.getRecurrenceType())
4019       ReducedPartRdx = RdxDesc.isSigned()
4020                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4021                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4022   }
4023 
4024   PHINode *ResumePhi =
4025       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4026 
4027   // Create a phi node that merges control-flow from the backedge-taken check
4028   // block and the middle block.
4029   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4030                                         LoopScalarPreHeader->getTerminator());
4031 
4032   // If we are fixing reductions in the epilogue loop then we should already
4033   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4034   // we carry over the incoming values correctly.
4035   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4036     if (Incoming == LoopMiddleBlock)
4037       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4038     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4039       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4040                               Incoming);
4041     else
4042       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4043   }
4044 
4045   // Set the resume value for this reduction
4046   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4047 
4048   // Now, we need to fix the users of the reduction variable
4049   // inside and outside of the scalar remainder loop.
4050 
4051   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4052   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4054   if (!Cost->requiresScalarEpilogue(VF))
4055     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4056       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4057         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4058 
4059   // Fix the scalar loop reduction variable with the incoming reduction sum
4060   // from the vector body and from the backedge value.
4061   int IncomingEdgeBlockIdx =
4062       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4063   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4064   // Pick the other block.
4065   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4066   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4067   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4068 }
4069 
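// Vectorizing an integer add or mul reduction reassociates the scalar
// operations, so any nsw/nuw flags on the reduction chain are no longer
// guaranteed to hold. Drop the poison-generating flags from the widened
// instructions of the chain, starting at the loop-exit instruction.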
void InnerLoopVectorizer::clearReductionWrapFlags(
    const RecurrenceDescriptor &RdxDesc, VPTransformState &State) {
4072   RecurKind RK = RdxDesc.getRecurrenceKind();
4073   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4074     return;
4075 
4076   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4077   assert(LoopExitInstr && "null loop exit instruction");
4078   SmallVector<Instruction *, 8> Worklist;
4079   SmallPtrSet<Instruction *, 8> Visited;
4080   Worklist.push_back(LoopExitInstr);
4081   Visited.insert(LoopExitInstr);
4082 
4083   while (!Worklist.empty()) {
4084     Instruction *Cur = Worklist.pop_back_val();
4085     if (isa<OverflowingBinaryOperator>(Cur))
4086       for (unsigned Part = 0; Part < UF; ++Part) {
4087         // FIXME: Should not rely on getVPValue at this point.
4088         Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4089         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4090       }
4091 
4092     for (User *U : Cur->users()) {
4093       Instruction *UI = cast<Instruction>(U);
4094       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4095           Visited.insert(UI).second)
4096         Worklist.push_back(UI);
4097     }
4098   }
4099 }
4100 
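// Add the missing incoming value from the middle block to each LCSSA phi in
// the exit block that was not already updated by the reduction or recurrence
// fix-up, using either the loop-invariant incoming value or the value of the
// last (or, if uniform, first) lane produced by the vector loop.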
4101 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4102   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4103     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
      // Some phis were already hand updated by the reduction and recurrence
      // code above; leave them alone.
4106       continue;
4107 
4108     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4109     // Non-instruction incoming values will have only one value.
4110 
4111     VPLane Lane = VPLane::getFirstLane();
4112     if (isa<Instruction>(IncomingValue) &&
4113         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4114                                            VF))
4115       Lane = VPLane::getLastLaneForVF(VF);
4116 
4117     // Can be a loop invariant incoming value or the last scalar value to be
4118     // extracted from the vectorized loop.
4119     // FIXME: Should not rely on getVPValue at this point.
4120     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4121     Value *lastIncomingValue =
4122         OrigLoop->isLoopInvariant(IncomingValue)
4123             ? IncomingValue
4124             : State.get(State.Plan->getVPValue(IncomingValue, true),
4125                         VPIteration(UF - 1, Lane));
4126     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4127   }
4128 }
4129 
4130 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4131   // The basic block and loop containing the predicated instruction.
4132   auto *PredBB = PredInst->getParent();
4133   auto *VectorLoop = LI->getLoopFor(PredBB);
4134 
4135   // Initialize a worklist with the operands of the predicated instruction.
4136   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4137 
4138   // Holds instructions that we need to analyze again. An instruction may be
4139   // reanalyzed if we don't yet know if we can sink it or not.
4140   SmallVector<Instruction *, 8> InstsToReanalyze;
4141 
4142   // Returns true if a given use occurs in the predicated block. Phi nodes use
4143   // their operands in their corresponding predecessor blocks.
4144   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4145     auto *I = cast<Instruction>(U.getUser());
4146     BasicBlock *BB = I->getParent();
4147     if (auto *Phi = dyn_cast<PHINode>(I))
4148       BB = Phi->getIncomingBlock(
4149           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4150     return BB == PredBB;
4151   };
4152 
4153   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
4155   // operands are then added to the worklist. The algorithm ends after one pass
4156   // through the worklist doesn't sink a single instruction.
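  // For example, if a scalarized store has been sunk into its own predicated
  // block, an address computation (e.g. a GEP) used only by that store can be
  // sunk after it, and then the GEP's own operands are considered in turn.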
4157   bool Changed;
4158   do {
4159     // Add the instructions that need to be reanalyzed to the worklist, and
4160     // reset the changed indicator.
4161     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4162     InstsToReanalyze.clear();
4163     Changed = false;
4164 
4165     while (!Worklist.empty()) {
4166       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4167 
4168       // We can't sink an instruction if it is a phi node, is not in the loop,
4169       // or may have side effects.
4170       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4171           I->mayHaveSideEffects())
4172         continue;
4173 
4174       // If the instruction is already in PredBB, check if we can sink its
4175       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4176       // sinking the scalar instruction I, hence it appears in PredBB; but it
4177       // may have failed to sink I's operands (recursively), which we try
4178       // (again) here.
4179       if (I->getParent() == PredBB) {
4180         Worklist.insert(I->op_begin(), I->op_end());
4181         continue;
4182       }
4183 
4184       // It's legal to sink the instruction if all its uses occur in the
4185       // predicated block. Otherwise, there's nothing to do yet, and we may
4186       // need to reanalyze the instruction.
4187       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4188         InstsToReanalyze.push_back(I);
4189         continue;
4190       }
4191 
4192       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4194       I->moveBefore(&*PredBB->getFirstInsertionPt());
4195       Worklist.insert(I->op_begin(), I->op_end());
4196 
4197       // The sinking may have enabled other instructions to be sunk, so we will
4198       // need to iterate.
4199       Changed = true;
4200     }
4201   } while (Changed);
4202 }
4203 
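// Hook up the incoming values of the widened non-induction phis created for
// the VPlan-native path, now that the vectorized incoming values and their
// IR basic blocks are available.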
4204 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4205   for (PHINode *OrigPhi : OrigPHIsToFix) {
4206     VPWidenPHIRecipe *VPPhi =
4207         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4208     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4209     // Make sure the builder has a valid insert point.
4210     Builder.SetInsertPoint(NewPhi);
4211     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4212       VPValue *Inc = VPPhi->getIncomingValue(i);
4213       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4214       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4215     }
4216   }
4217 }
4218 
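// Whether to use strict in-order (ordered) reductions is a cost-model
// decision; simply forward the query.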
4219 bool InnerLoopVectorizer::useOrderedReductions(
4220     const RecurrenceDescriptor &RdxDesc) {
4221   return Cost->useOrderedReductions(RdxDesc);
4222 }
4223 
4224 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4225                                               VPWidenPHIRecipe *PhiR,
4226                                               VPTransformState &State) {
4227   PHINode *P = cast<PHINode>(PN);
4228   if (EnableVPlanNativePath) {
4229     // Currently we enter here in the VPlan-native path for non-induction
4230     // PHIs where all control flow is uniform. We simply widen these PHIs.
4231     // Create a vector phi with no operands - the vector phi operands will be
4232     // set at the end of vector code generation.
4233     Type *VecTy = (State.VF.isScalar())
4234                       ? PN->getType()
4235                       : VectorType::get(PN->getType(), State.VF);
4236     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4237     State.set(PhiR, VecPhi, 0);
4238     OrigPHIsToFix.push_back(P);
4239 
4240     return;
4241   }
4242 
4243   assert(PN->getParent() == OrigLoop->getHeader() &&
4244          "Non-header phis should have been handled elsewhere");
4245 
4246   // In order to support recurrences we need to be able to vectorize Phi nodes.
4247   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4248   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4249   // this value when we vectorize all of the instructions that use the PHI.
4250 
4251   assert(!Legal->isReductionVariable(P) &&
4252          "reductions should be handled elsewhere");
4253 
4254   setDebugLocFromInst(P);
4255 
4256   // This PHINode must be an induction variable.
4257   // Make sure that we know about it.
4258   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4259 
4260   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4261   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4262 
4263   auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV();
4264   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
4265 
4266   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4267   // which can be found from the original scalar operations.
4268   switch (II.getKind()) {
4269   case InductionDescriptor::IK_NoInduction:
4270     llvm_unreachable("Unknown induction");
4271   case InductionDescriptor::IK_IntInduction:
4272   case InductionDescriptor::IK_FpInduction:
4273     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4274   case InductionDescriptor::IK_PtrInduction: {
4275     // Handle the pointer induction variable case.
4276     assert(P->getType()->isPointerTy() && "Unexpected type.");
4277 
4278     if (Cost->isScalarAfterVectorization(P, State.VF)) {
4279       // This is the normalized GEP that starts counting at zero.
4280       Value *PtrInd =
4281           Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType());
4282       // Determine the number of scalars we need to generate for each unroll
4283       // iteration. If the instruction is uniform, we only need to generate the
4284       // first lane. Otherwise, we generate all VF values.
4285       bool IsUniform = vputils::onlyFirstLaneUsed(PhiR);
4286       assert((IsUniform || !State.VF.isScalable()) &&
4287              "Cannot scalarize a scalable VF");
4288       unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
4289 
4290       for (unsigned Part = 0; Part < UF; ++Part) {
4291         Value *PartStart =
4292             createStepForVF(Builder, PtrInd->getType(), VF, Part);
4293 
4294         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4295           Value *Idx = Builder.CreateAdd(
4296               PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4297           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4298 
4299           Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
4300                                         State.CFG.PrevBB->getTerminator());
4301           Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx,
4302                                                 II.getStartValue(), Step, II);
4303           SclrGep->setName("next.gep");
4304           State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4305         }
4306       }
4307       return;
4308     }
4309     assert(isa<SCEVConstant>(II.getStep()) &&
4310            "Induction step not a SCEV constant!");
4311     Type *PhiType = II.getStep()->getType();
4312 
4313     // Build a pointer phi
4314     Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue();
4315     Type *ScStValueType = ScalarStartValue->getType();
4316     PHINode *NewPointerPhi =
4317         PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
4318     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4319 
    // A pointer induction, implemented with a GEP that advances the pointer
    // phi by Step * VF * UF elements every vector iteration.
4321     BasicBlock *LoopLatch = LI->getLoopFor(State.CFG.PrevBB)->getLoopLatch();
4322     Instruction *InductionLoc = LoopLatch->getTerminator();
4323     const SCEV *ScalarStep = II.getStep();
4324     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4325     Value *ScalarStepValue =
4326         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4327     Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4328     Value *NumUnrolledElems =
4329         Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4330     Value *InductionGEP = GetElementPtrInst::Create(
4331         II.getElementType(), NewPointerPhi,
4332         Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4333         InductionLoc);
4334     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4335 
4336     // Create UF many actual address geps that use the pointer
4337     // phi as base and a vectorized version of the step value
4338     // (<step*0, ..., step*N>) as offset.
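    // For example, with VF = 4, UF = 2 and scalar step S, Part 0 uses offsets
    // S * <0, 1, 2, 3> and Part 1 uses offsets S * <4, 5, 6, 7> relative to
    // the pointer phi.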
4339     for (unsigned Part = 0; Part < State.UF; ++Part) {
4340       Type *VecPhiType = VectorType::get(PhiType, State.VF);
4341       Value *StartOffsetScalar =
4342           Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4343       Value *StartOffset =
4344           Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
4345       // Create a vector of consecutive numbers from zero to VF.
4346       StartOffset =
4347           Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4348 
4349       Value *GEP = Builder.CreateGEP(
4350           II.getElementType(), NewPointerPhi,
4351           Builder.CreateMul(
4352               StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4353               "vector.gep"));
4354       State.set(PhiR, GEP, Part);
4355     }
4356   }
4357   }
4358 }
4359 
4360 /// A helper function for checking whether an integer division-related
4361 /// instruction may divide by zero (in which case it must be predicated if
4362 /// executed conditionally in the scalar code).
4363 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
/// Non-zero divisors that are non-compile-time constants will not be
4365 /// converted into multiplication, so we will still end up scalarizing
4366 /// the division, but can do so w/o predication.
4367 static bool mayDivideByZero(Instruction &I) {
4368   assert((I.getOpcode() == Instruction::UDiv ||
4369           I.getOpcode() == Instruction::SDiv ||
4370           I.getOpcode() == Instruction::URem ||
4371           I.getOpcode() == Instruction::SRem) &&
4372          "Unexpected instruction");
4373   Value *Divisor = I.getOperand(1);
4374   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4375   return !CInt || CInt->isZero();
4376 }
4377 
4378 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4379                                                VPUser &ArgOperands,
4380                                                VPTransformState &State) {
4381   assert(!isa<DbgInfoIntrinsic>(I) &&
4382          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4383   setDebugLocFromInst(&I);
4384 
4385   Module *M = I.getParent()->getParent()->getParent();
4386   auto *CI = cast<CallInst>(&I);
4387 
4388   SmallVector<Type *, 4> Tys;
4389   for (Value *ArgOperand : CI->args())
4390     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4391 
4392   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4393 
  // The flag indicates whether the vectorized version of the instruction uses
  // an intrinsic or an ordinary call, i.e. whether an intrinsic call is more
  // beneficial than a library call.
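  // For example, a call that maps to the sqrt intrinsic may either be widened
  // to the vector llvm.sqrt intrinsic or replaced by a vector math-library
  // function known through the VFDatabase, whichever the cost model considers
  // cheaper.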
4397   bool NeedToScalarize = false;
4398   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4399   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4400   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4401   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4402          "Instruction should be scalarized elsewhere.");
4403   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4404          "Either the intrinsic cost or vector call cost must be valid");
4405 
4406   for (unsigned Part = 0; Part < UF; ++Part) {
4407     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4408     SmallVector<Value *, 4> Args;
4409     for (auto &I : enumerate(ArgOperands.operands())) {
4410       // Some intrinsics have a scalar argument - don't replace it with a
4411       // vector.
4412       Value *Arg;
4413       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4414         Arg = State.get(I.value(), Part);
4415       else {
4416         Arg = State.get(I.value(), VPIteration(0, 0));
4417         if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
4418           TysForDecl.push_back(Arg->getType());
4419       }
4420       Args.push_back(Arg);
4421     }
4422 
4423     Function *VectorF;
4424     if (UseVectorIntrinsic) {
4425       // Use vector version of the intrinsic.
4426       if (VF.isVector())
4427         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4428       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4429       assert(VectorF && "Can't retrieve vector intrinsic.");
4430     } else {
4431       // Use vector version of the function call.
4432       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4433 #ifndef NDEBUG
4434       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4435              "Can't create vector function.");
4436 #endif
      VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
    }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI->getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(CI);

    State.set(Def, V, Part);
    addMetadata(V, &I);
4448   }
4449 }
4450 
4451 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4452   // We should not collect Scalars more than once per VF. Right now, this
4453   // function is called from collectUniformsAndScalars(), which already does
4454   // this check. Collecting Scalars for VF=1 does not make any sense.
4455   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4456          "This function should not be visited twice for the same VF");
4457 
4458   SmallSetVector<Instruction *, 8> Worklist;
4459 
4460   // These sets are used to seed the analysis with pointers used by memory
4461   // accesses that will remain scalar.
4462   SmallSetVector<Instruction *, 8> ScalarPtrs;
4463   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4464   auto *Latch = TheLoop->getLoopLatch();
4465 
4466   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4467   // The pointer operands of loads and stores will be scalar as long as the
4468   // memory access is not a gather or scatter operation. The value operand of a
4469   // store will remain scalar if the store is scalarized.
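  // For example, the address of a consecutive (widened) or interleaved access
  // is a scalar use, whereas the address of a gather or scatter is not.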
4470   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4471     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4472     assert(WideningDecision != CM_Unknown &&
4473            "Widening decision should be ready at this moment");
4474     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4475       if (Ptr == Store->getValueOperand())
4476         return WideningDecision == CM_Scalarize;
4477     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4478            "Ptr is neither a value or pointer operand");
4479     return WideningDecision != CM_GatherScatter;
4480   };
4481 
4482   // A helper that returns true if the given value is a bitcast or
4483   // getelementptr instruction contained in the loop.
4484   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4485     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4486             isa<GetElementPtrInst>(V)) &&
4487            !TheLoop->isLoopInvariant(V);
4488   };
4489 
4490   // A helper that evaluates a memory access's use of a pointer. If the use will
4491   // be a scalar use and the pointer is only used by memory accesses, we place
4492   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4493   // PossibleNonScalarPtrs.
4494   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4495     // We only care about bitcast and getelementptr instructions contained in
4496     // the loop.
4497     if (!isLoopVaryingBitCastOrGEP(Ptr))
4498       return;
4499 
4500     // If the pointer has already been identified as scalar (e.g., if it was
4501     // also identified as uniform), there's nothing to do.
4502     auto *I = cast<Instruction>(Ptr);
4503     if (Worklist.count(I))
4504       return;
4505 
4506     // If the use of the pointer will be a scalar use, and all users of the
4507     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4508     // place the pointer in PossibleNonScalarPtrs.
4509     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4510           return isa<LoadInst>(U) || isa<StoreInst>(U);
4511         }))
4512       ScalarPtrs.insert(I);
4513     else
4514       PossibleNonScalarPtrs.insert(I);
4515   };
4516 
  // We seed the scalars analysis with two classes of instructions: (1)
4518   // instructions marked uniform-after-vectorization and (2) bitcast,
4519   // getelementptr and (pointer) phi instructions used by memory accesses
4520   // requiring a scalar use.
4521   //
4522   // (1) Add to the worklist all instructions that have been identified as
4523   // uniform-after-vectorization.
4524   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4525 
4526   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4527   // memory accesses requiring a scalar use. The pointer operands of loads and
4528   // stores will be scalar as long as the memory accesses is not a gather or
4529   // scatter operation. The value operand of a store will remain scalar if the
4530   // store is scalarized.
4531   for (auto *BB : TheLoop->blocks())
4532     for (auto &I : *BB) {
4533       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4534         evaluatePtrUse(Load, Load->getPointerOperand());
4535       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4536         evaluatePtrUse(Store, Store->getPointerOperand());
4537         evaluatePtrUse(Store, Store->getValueOperand());
4538       }
4539     }
4540   for (auto *I : ScalarPtrs)
4541     if (!PossibleNonScalarPtrs.count(I)) {
4542       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4543       Worklist.insert(I);
4544     }
4545 
4546   // Insert the forced scalars.
4547   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4548   // induction variable when the PHI user is scalarized.
4549   auto ForcedScalar = ForcedScalars.find(VF);
4550   if (ForcedScalar != ForcedScalars.end())
4551     for (auto *I : ForcedScalar->second)
4552       Worklist.insert(I);
4553 
4554   // Expand the worklist by looking through any bitcasts and getelementptr
4555   // instructions we've already identified as scalar. This is similar to the
4556   // expansion step in collectLoopUniforms(); however, here we're only
4557   // expanding to include additional bitcasts and getelementptr instructions.
4558   unsigned Idx = 0;
4559   while (Idx != Worklist.size()) {
4560     Instruction *Dst = Worklist[Idx++];
4561     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4562       continue;
4563     auto *Src = cast<Instruction>(Dst->getOperand(0));
4564     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4565           auto *J = cast<Instruction>(U);
4566           return !TheLoop->contains(J) || Worklist.count(J) ||
4567                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4568                   isScalarUse(J, Src));
4569         })) {
4570       Worklist.insert(Src);
4571       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4572     }
4573   }
4574 
4575   // An induction variable will remain scalar if all users of the induction
4576   // variable and induction variable update remain scalar.
4577   for (auto &Induction : Legal->getInductionVars()) {
4578     auto *Ind = Induction.first;
4579     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4580 
4581     // If tail-folding is applied, the primary induction variable will be used
4582     // to feed a vector compare.
4583     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4584       continue;
4585 
4586     // Returns true if \p Indvar is a pointer induction that is used directly by
4587     // load/store instruction \p I.
4588     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4589                                               Instruction *I) {
4590       return Induction.second.getKind() ==
4591                  InductionDescriptor::IK_PtrInduction &&
4592              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4593              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4594     };
4595 
4596     // Determine if all users of the induction variable are scalar after
4597     // vectorization.
4598     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4599       auto *I = cast<Instruction>(U);
4600       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4601              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4602     });
4603     if (!ScalarInd)
4604       continue;
4605 
4606     // Determine if all users of the induction variable update instruction are
4607     // scalar after vectorization.
4608     auto ScalarIndUpdate =
4609         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4610           auto *I = cast<Instruction>(U);
4611           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4612                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4613         });
4614     if (!ScalarIndUpdate)
4615       continue;
4616 
4617     // The induction variable and its update instruction will remain scalar.
4618     Worklist.insert(Ind);
4619     Worklist.insert(IndUpdate);
4620     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4621     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4622                       << "\n");
4623   }
4624 
4625   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4626 }
4627 
4628 bool LoopVectorizationCostModel::isScalarWithPredication(
4629     Instruction *I, ElementCount VF) const {
4630   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4631     return false;
4632   switch(I->getOpcode()) {
4633   default:
4634     break;
4635   case Instruction::Load:
4636   case Instruction::Store: {
4637     if (!Legal->isMaskRequired(I))
4638       return false;
4639     auto *Ptr = getLoadStorePointerOperand(I);
4640     auto *Ty = getLoadStoreType(I);
4641     Type *VTy = Ty;
4642     if (VF.isVector())
4643       VTy = VectorType::get(Ty, VF);
4644     const Align Alignment = getLoadStoreAlignment(I);
4645     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4646                                 TTI.isLegalMaskedGather(VTy, Alignment))
4647                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4648                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4649   }
4650   case Instruction::UDiv:
4651   case Instruction::SDiv:
4652   case Instruction::SRem:
4653   case Instruction::URem:
4654     return mayDivideByZero(*I);
4655   }
4656   return false;
4657 }
4658 
4659 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4660     Instruction *I, ElementCount VF) {
4661   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4662   assert(getWideningDecision(I, VF) == CM_Unknown &&
4663          "Decision should not be set yet.");
4664   auto *Group = getInterleavedAccessGroup(I);
4665   assert(Group && "Must have a group.");
4666 
  // If the instruction's allocated size doesn't equal its type size, it
4668   // requires padding and will be scalarized.
4669   auto &DL = I->getModule()->getDataLayout();
4670   auto *ScalarTy = getLoadStoreType(I);
4671   if (hasIrregularType(ScalarTy, DL))
4672     return false;
4673 
4674   // Check if masking is required.
4675   // A Group may need masking for one of two reasons: it resides in a block that
4676   // needs predication, or it was decided to use masking to deal with gaps
4677   // (either a gap at the end of a load-access that may result in a speculative
4678   // load, or any gaps in a store-access).
4679   bool PredicatedAccessRequiresMasking =
4680       blockNeedsPredicationForAnyReason(I->getParent()) &&
4681       Legal->isMaskRequired(I);
4682   bool LoadAccessWithGapsRequiresEpilogMasking =
4683       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4684       !isScalarEpilogueAllowed();
4685   bool StoreAccessWithGapsRequiresMasking =
4686       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4687   if (!PredicatedAccessRequiresMasking &&
4688       !LoadAccessWithGapsRequiresEpilogMasking &&
4689       !StoreAccessWithGapsRequiresMasking)
4690     return true;
4691 
4692   // If masked interleaving is required, we expect that the user/target had
4693   // enabled it, because otherwise it either wouldn't have been created or
4694   // it should have been invalidated by the CostModel.
4695   assert(useMaskedInterleavedAccesses(TTI) &&
4696          "Masked interleave-groups for predicated accesses are not enabled.");
4697 
4698   if (Group->isReverse())
4699     return false;
4700 
4701   auto *Ty = getLoadStoreType(I);
4702   const Align Alignment = getLoadStoreAlignment(I);
4703   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4704                           : TTI.isLegalMaskedStore(Ty, Alignment);
4705 }
4706 
4707 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4708     Instruction *I, ElementCount VF) {
4709   // Get and ensure we have a valid memory instruction.
4710   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4711 
4712   auto *Ptr = getLoadStorePointerOperand(I);
4713   auto *ScalarTy = getLoadStoreType(I);
4714 
4715   // In order to be widened, the pointer should be consecutive, first of all.
4716   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4717     return false;
4718 
4719   // If the instruction is a store located in a predicated block, it will be
4720   // scalarized.
4721   if (isScalarWithPredication(I, VF))
4722     return false;
4723 
  // If the instruction's allocated size doesn't equal its type size, it
4725   // requires padding and will be scalarized.
4726   auto &DL = I->getModule()->getDataLayout();
4727   if (hasIrregularType(ScalarTy, DL))
4728     return false;
4729 
4730   return true;
4731 }
4732 
4733 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4734   // We should not collect Uniforms more than once per VF. Right now,
4735   // this function is called from collectUniformsAndScalars(), which
4736   // already does this check. Collecting Uniforms for VF=1 does not make any
4737   // sense.
4738 
4739   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4740          "This function should not be visited twice for the same VF");
4741 
  // Initialize the entry for this VF. Even if no uniform values are found,
  // Uniforms.count(VF) will return 1, so we will not analyze this VF again.
4744   Uniforms[VF].clear();
4745 
4746   // We now know that the loop is vectorizable!
4747   // Collect instructions inside the loop that will remain uniform after
4748   // vectorization.
4749 
  // Global values, params and instructions outside of the current loop are
  // out of scope.
4752   auto isOutOfScope = [&](Value *V) -> bool {
4753     Instruction *I = dyn_cast<Instruction>(V);
4754     return (!I || !TheLoop->contains(I));
4755   };
4756 
4757   // Worklist containing uniform instructions demanding lane 0.
4758   SetVector<Instruction *> Worklist;
4759   BasicBlock *Latch = TheLoop->getLoopLatch();
4760 
4761   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4762   // that are scalar with predication must not be considered uniform after
4763   // vectorization, because that would create an erroneous replicating region
4764   // where only a single instance out of VF should be formed.
  // TODO: optimize such rare cases if found important; see PR40816.
4766   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4767     if (isOutOfScope(I)) {
4768       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4769                         << *I << "\n");
4770       return;
4771     }
4772     if (isScalarWithPredication(I, VF)) {
4773       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4774                         << *I << "\n");
4775       return;
4776     }
4777     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4778     Worklist.insert(I);
4779   };
4780 
4781   // Start with the conditional branch. If the branch condition is an
4782   // instruction contained in the loop that is only used by the branch, it is
4783   // uniform.
4784   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4785   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4786     addToWorklistIfAllowed(Cmp);
4787 
4788   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4789     InstWidening WideningDecision = getWideningDecision(I, VF);
4790     assert(WideningDecision != CM_Unknown &&
4791            "Widening decision should be ready at this moment");
4792 
4793     // A uniform memory op is itself uniform.  We exclude uniform stores
4794     // here as they demand the last lane, not the first one.
4795     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
4796       assert(WideningDecision == CM_Scalarize);
4797       return true;
4798     }
4799 
4800     return (WideningDecision == CM_Widen ||
4801             WideningDecision == CM_Widen_Reverse ||
4802             WideningDecision == CM_Interleave);
4803   };
4804 
4805 
4806   // Returns true if Ptr is the pointer operand of a memory access instruction
4807   // I, and I is known to not require scalarization.
4808   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4809     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4810   };
4811 
4812   // Holds a list of values which are known to have at least one uniform use.
4813   // Note that there may be other uses which aren't uniform.  A "uniform use"
4814   // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (i.e. this is not
  // the usual meaning of uniform).
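  // For example, the pointer operand of a consecutive widened load typically
  // has a uniform use: only the lane-0 address is needed to form the wide
  // access, even though the pointer values for the other lanes differ.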
4817   SetVector<Value *> HasUniformUse;
4818 
4819   // Scan the loop for instructions which are either a) known to have only
4820   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4821   for (auto *BB : TheLoop->blocks())
4822     for (auto &I : *BB) {
4823       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4824         switch (II->getIntrinsicID()) {
4825         case Intrinsic::sideeffect:
4826         case Intrinsic::experimental_noalias_scope_decl:
4827         case Intrinsic::assume:
4828         case Intrinsic::lifetime_start:
4829         case Intrinsic::lifetime_end:
4830           if (TheLoop->hasLoopInvariantOperands(&I))
4831             addToWorklistIfAllowed(&I);
4832           break;
4833         default:
4834           break;
4835         }
4836       }
4837 
4838       // ExtractValue instructions must be uniform, because the operands are
4839       // known to be loop-invariant.
4840       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4841         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4842                "Expected aggregate value to be loop invariant");
4843         addToWorklistIfAllowed(EVI);
4844         continue;
4845       }
4846 
4847       // If there's no pointer operand, there's nothing to do.
4848       auto *Ptr = getLoadStorePointerOperand(&I);
4849       if (!Ptr)
4850         continue;
4851 
4852       // A uniform memory op is itself uniform.  We exclude uniform stores
4853       // here as they demand the last lane, not the first one.
4854       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
4855         addToWorklistIfAllowed(&I);
4856 
4857       if (isUniformDecision(&I, VF)) {
4858         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4859         HasUniformUse.insert(Ptr);
4860       }
4861     }
4862 
  // Add to the worklist any operands which have *only* uniform (i.e. lane 0
4864   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4865   // disallows uses outside the loop as well.
4866   for (auto *V : HasUniformUse) {
4867     if (isOutOfScope(V))
4868       continue;
4869     auto *I = cast<Instruction>(V);
4870     auto UsersAreMemAccesses =
4871       llvm::all_of(I->users(), [&](User *U) -> bool {
4872         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4873       });
4874     if (UsersAreMemAccesses)
4875       addToWorklistIfAllowed(I);
4876   }
4877 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // that a uniform instruction will only be used by uniform instructions.
4881   unsigned idx = 0;
4882   while (idx != Worklist.size()) {
4883     Instruction *I = Worklist[idx++];
4884 
4885     for (auto OV : I->operand_values()) {
4886       // isOutOfScope operands cannot be uniform instructions.
4887       if (isOutOfScope(OV))
4888         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4891       auto *OP = dyn_cast<PHINode>(OV);
4892       if (OP && Legal->isFirstOrderRecurrence(OP))
4893         continue;
4894       // If all the users of the operand are uniform, then add the
4895       // operand into the uniform worklist.
4896       auto *OI = cast<Instruction>(OV);
4897       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4898             auto *J = cast<Instruction>(U);
4899             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4900           }))
4901         addToWorklistIfAllowed(OI);
4902     }
4903   }
4904 
4905   // For an instruction to be added into Worklist above, all its users inside
4906   // the loop should also be in Worklist. However, this condition cannot be
4907   // true for phi nodes that form a cyclic dependence. We must process phi
4908   // nodes separately. An induction variable will remain uniform if all users
4909   // of the induction variable and induction variable update remain uniform.
4910   // The code below handles both pointer and non-pointer induction variables.
4911   for (auto &Induction : Legal->getInductionVars()) {
4912     auto *Ind = Induction.first;
4913     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4914 
4915     // Determine if all users of the induction variable are uniform after
4916     // vectorization.
4917     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4918       auto *I = cast<Instruction>(U);
4919       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4920              isVectorizedMemAccessUse(I, Ind);
4921     });
4922     if (!UniformInd)
4923       continue;
4924 
4925     // Determine if all users of the induction variable update instruction are
4926     // uniform after vectorization.
4927     auto UniformIndUpdate =
4928         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4929           auto *I = cast<Instruction>(U);
4930           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4931                  isVectorizedMemAccessUse(I, IndUpdate);
4932         });
4933     if (!UniformIndUpdate)
4934       continue;
4935 
4936     // The induction variable and its update instruction will remain uniform.
4937     addToWorklistIfAllowed(Ind);
4938     addToWorklistIfAllowed(IndUpdate);
4939   }
4940 
4941   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4942 }
4943 
4944 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4945   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4946 
4947   if (Legal->getRuntimePointerChecking()->Need) {
4948     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4949         "runtime pointer checks needed. Enable vectorization of this "
4950         "loop with '#pragma clang loop vectorize(enable)' when "
4951         "compiling with -Os/-Oz",
4952         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4953     return true;
4954   }
4955 
4956   if (!PSE.getPredicate().isAlwaysTrue()) {
4957     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4958         "runtime SCEV checks needed. Enable vectorization of this "
4959         "loop with '#pragma clang loop vectorize(enable)' when "
4960         "compiling with -Os/-Oz",
4961         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4962     return true;
4963   }
4964 
4965   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4966   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4967     reportVectorizationFailure("Runtime stride check for small trip count",
4968         "runtime stride == 1 checks needed. Enable vectorization of "
4969         "this loop without such check by compiling with -Os/-Oz",
4970         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4971     return true;
4972   }
4973 
4974   return false;
4975 }
4976 
4977 ElementCount
4978 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4979   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4980     return ElementCount::getScalable(0);
4981 
4982   if (Hints->isScalableVectorizationDisabled()) {
4983     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4984                             "ScalableVectorizationDisabled", ORE, TheLoop);
4985     return ElementCount::getScalable(0);
4986   }
4987 
4988   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4989 
4990   auto MaxScalableVF = ElementCount::getScalable(
4991       std::numeric_limits<ElementCount::ScalarTy>::max());
4992 
4993   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4994   // FIXME: While for scalable vectors this is currently sufficient, this should
4995   // be replaced by a more detailed mechanism that filters out specific VFs,
4996   // instead of invalidating vectorization for a whole set of VFs based on the
4997   // MaxVF.
4998 
4999   // Disable scalable vectorization if the loop contains unsupported reductions.
5000   if (!canVectorizeReductions(MaxScalableVF)) {
5001     reportVectorizationInfo(
5002         "Scalable vectorization not supported for the reduction "
5003         "operations found in this loop.",
5004         "ScalableVFUnfeasible", ORE, TheLoop);
5005     return ElementCount::getScalable(0);
5006   }
5007 
5008   // Disable scalable vectorization if the loop contains any instructions
5009   // with element types not supported for scalable vectors.
5010   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
5011         return !Ty->isVoidTy() &&
5012                !this->TTI.isElementTypeLegalForScalableVector(Ty);
5013       })) {
5014     reportVectorizationInfo("Scalable vectorization is not supported "
5015                             "for all element types found in this loop.",
5016                             "ScalableVFUnfeasible", ORE, TheLoop);
5017     return ElementCount::getScalable(0);
5018   }
5019 
5020   if (Legal->isSafeForAnyVectorWidth())
5021     return MaxScalableVF;
5022 
5023   // Limit MaxScalableVF by the maximum safe dependence distance.
5024   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5025   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
5026     MaxVScale =
5027         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
5028   MaxScalableVF = ElementCount::getScalable(
5029       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
5030   if (!MaxScalableVF)
5031     reportVectorizationInfo(
5032         "Max legal vector width too small, scalable vectorization "
5033         "unfeasible.",
5034         "ScalableVFUnfeasible", ORE, TheLoop);
5035 
5036   return MaxScalableVF;
5037 }
5038 
5039 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
5040     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
5041   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5042   unsigned SmallestType, WidestType;
5043   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5044 
5045   // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
  // dependence distance).
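  // For example (illustrative values): with a maximum safe vector width of
  // 256 bits and a widest type of 32 bits, MaxSafeElements is
  // PowerOf2Floor(256 / 32) = 8.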
5049   unsigned MaxSafeElements =
5050       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
5051 
5052   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
5053   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
5054 
5055   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5056                     << ".\n");
5057   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5058                     << ".\n");
5059 
5060   // First analyze the UserVF, fall back if the UserVF should be ignored.
5061   if (UserVF) {
5062     auto MaxSafeUserVF =
5063         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
5064 
5065     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
5066       // If `VF=vscale x N` is safe, then so is `VF=N`
5067       if (UserVF.isScalable())
5068         return FixedScalableVFPair(
5069             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
5070       else
5071         return UserVF;
5072     }
5073 
5074     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5075 
5076     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
5077     // is better to ignore the hint and let the compiler choose a suitable VF.
5078     if (!UserVF.isScalable()) {
5079       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5080                         << " is unsafe, clamping to max safe VF="
5081                         << MaxSafeFixedVF << ".\n");
5082       ORE->emit([&]() {
5083         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5084                                           TheLoop->getStartLoc(),
5085                                           TheLoop->getHeader())
5086                << "User-specified vectorization factor "
5087                << ore::NV("UserVectorizationFactor", UserVF)
5088                << " is unsafe, clamping to maximum safe vectorization factor "
5089                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
5090       });
5091       return MaxSafeFixedVF;
5092     }
5093 
5094     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
5095       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5096                         << " is ignored because scalable vectors are not "
5097                            "available.\n");
5098       ORE->emit([&]() {
5099         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5100                                           TheLoop->getStartLoc(),
5101                                           TheLoop->getHeader())
5102                << "User-specified vectorization factor "
5103                << ore::NV("UserVectorizationFactor", UserVF)
5104                << " is ignored because the target does not support scalable "
5105                   "vectors. The compiler will pick a more suitable value.";
5106       });
5107     } else {
5108       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5109                         << " is unsafe. Ignoring scalable UserVF.\n");
5110       ORE->emit([&]() {
5111         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5112                                           TheLoop->getStartLoc(),
5113                                           TheLoop->getHeader())
5114                << "User-specified vectorization factor "
5115                << ore::NV("UserVectorizationFactor", UserVF)
5116                << " is unsafe. Ignoring the hint to let the compiler pick a "
5117                   "more suitable value.";
5118       });
5119     }
5120   }
5121 
5122   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5123                     << " / " << WidestType << " bits.\n");
5124 
5125   FixedScalableVFPair Result(ElementCount::getFixed(1),
5126                              ElementCount::getScalable(0));
5127   if (auto MaxVF =
5128           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5129                                   MaxSafeFixedVF, FoldTailByMasking))
5130     Result.FixedVF = MaxVF;
5131 
5132   if (auto MaxVF =
5133           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5134                                   MaxSafeScalableVF, FoldTailByMasking))
5135     if (MaxVF.isScalable()) {
5136       Result.ScalableVF = MaxVF;
5137       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5138                         << "\n");
5139     }
5140 
5141   return Result;
5142 }
5143 
5144 FixedScalableVFPair
5145 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5146   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this, since the check is still likely to be
    // dynamically uniform if the target can skip.
5149     reportVectorizationFailure(
5150         "Not inserting runtime ptr check for divergent target",
5151         "runtime pointer checks needed. Not enabled for divergent target",
5152         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5153     return FixedScalableVFPair::getNone();
5154   }
5155 
5156   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5157   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5158   if (TC == 1) {
5159     reportVectorizationFailure("Single iteration (non) loop",
5160         "loop trip count is one, irrelevant for vectorization",
5161         "SingleIterationLoop", ORE, TheLoop);
5162     return FixedScalableVFPair::getNone();
5163   }
5164 
5165   switch (ScalarEpilogueStatus) {
5166   case CM_ScalarEpilogueAllowed:
5167     return computeFeasibleMaxVF(TC, UserVF, false);
5168   case CM_ScalarEpilogueNotAllowedUsePredicate:
5169     LLVM_FALLTHROUGH;
5170   case CM_ScalarEpilogueNotNeededUsePredicate:
5171     LLVM_DEBUG(
5172         dbgs() << "LV: vector predicate hint/switch found.\n"
5173                << "LV: Not allowing scalar epilogue, creating predicated "
5174                << "vector loop.\n");
5175     break;
5176   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5177     // fallthrough as a special case of OptForSize
5178   case CM_ScalarEpilogueNotAllowedOptSize:
5179     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5180       LLVM_DEBUG(
5181           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5182     else
5183       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5184                         << "count.\n");
5185 
    // Bail if runtime checks are required, which are not good when optimizing
    // for size.
5188     if (runtimeChecksRequired())
5189       return FixedScalableVFPair::getNone();
5190 
5191     break;
5192   }
5193 
  // The only loops we can vectorize without a scalar epilogue are loops with
5195   // a bottom-test and a single exiting block. We'd have to handle the fact
5196   // that not every instruction executes on the last iteration.  This will
5197   // require a lane mask which varies through the vector loop body.  (TODO)
5198   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5199     // If there was a tail-folding hint/switch, but we can't fold the tail by
5200     // masking, fallback to a vectorization with a scalar epilogue.
5201     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5202       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5203                            "scalar epilogue instead.\n");
5204       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5205       return computeFeasibleMaxVF(TC, UserVF, false);
5206     }
5207     return FixedScalableVFPair::getNone();
5208   }
5209 
  // Now try tail folding.
5211 
5212   // Invalidate interleave groups that require an epilogue if we can't mask
5213   // the interleave-group.
5214   if (!useMaskedInterleavedAccesses(TTI)) {
5215     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5216            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5219     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5220   }
5221 
5222   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5223   // Avoid tail folding if the trip count is known to be a multiple of any VF
5224   // we chose.
5225   // FIXME: The condition below pessimises the case for fixed-width vectors,
5226   // when scalable VFs are also candidates for vectorization.
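  // For example (illustrative values): with a trip count of 128, a maximum
  // fixed VF of 8 and a UserIC of 2, 128 % (8 * 2) == 0, so no tail remains
  // and tail folding is unnecessary.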
5227   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5228     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5229     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5230            "MaxFixedVF must be a power of 2");
5231     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5232                                    : MaxFixedVF.getFixedValue();
5233     ScalarEvolution *SE = PSE.getSE();
5234     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5235     const SCEV *ExitCount = SE->getAddExpr(
5236         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5237     const SCEV *Rem = SE->getURemExpr(
5238         SE->applyLoopGuards(ExitCount, TheLoop),
5239         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5240     if (Rem->isZero()) {
5241       // Accept MaxFixedVF if we do not have a tail.
5242       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5243       return MaxFactors;
5244     }
5245   }
5246 
  // For scalable vectors, don't use tail folding for low trip counts or when
  // optimizing for code size. We only permit this if the user has explicitly
  // requested it.
5250   if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
5251       ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
5252       MaxFactors.ScalableVF.isVector())
5253     MaxFactors.ScalableVF = ElementCount::getScalable(0);
5254 
5255   // If we don't know the precise trip count, or if the trip count that we
5256   // found modulo the vectorization factor is not zero, try to fold the tail
5257   // by masking.
5258   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5259   if (Legal->prepareToFoldTailByMasking()) {
5260     FoldTailByMasking = true;
5261     return MaxFactors;
5262   }
5263 
5264   // If there was a tail-folding hint/switch, but we can't fold the tail by
5265   // masking, fallback to a vectorization with a scalar epilogue.
5266   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5267     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5268                          "scalar epilogue instead.\n");
5269     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5270     return MaxFactors;
5271   }
5272 
5273   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5274     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5275     return FixedScalableVFPair::getNone();
5276   }
5277 
5278   if (TC == 0) {
5279     reportVectorizationFailure(
5280         "Unable to calculate the loop count due to complex control flow",
5281         "unable to calculate the loop count due to complex control flow",
5282         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5283     return FixedScalableVFPair::getNone();
5284   }
5285 
5286   reportVectorizationFailure(
5287       "Cannot optimize for size and vectorize at the same time.",
5288       "cannot optimize for size and vectorize at the same time. "
5289       "Enable vectorization of this loop with '#pragma clang loop "
5290       "vectorize(enable)' when compiling with -Os/-Oz",
5291       "NoTailLoopWithOptForSize", ORE, TheLoop);
5292   return FixedScalableVFPair::getNone();
5293 }
5294 
5295 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5296     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5297     const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
5298   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5299   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5300       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5301                            : TargetTransformInfo::RGK_FixedWidthVector);
5302 
5303   // Convenience function to return the minimum of two ElementCounts.
5304   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5305     assert((LHS.isScalable() == RHS.isScalable()) &&
5306            "Scalable flags must match");
5307     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5308   };
5309 
5310   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
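  // For example (illustrative values): a 256-bit widest register and a widest
  // type of 32 bits give a maximum element count of PowerOf2Floor(256 / 32),
  // i.e. 8 elements (fixed or scalable depending on ComputeScalableMaxVF).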
5312   auto MaxVectorElementCount = ElementCount::get(
5313       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5314       ComputeScalableMaxVF);
5315   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5316   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5317                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5318 
5319   if (!MaxVectorElementCount) {
5320     LLVM_DEBUG(dbgs() << "LV: The target has no "
5321                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5322                       << " vector registers.\n");
5323     return ElementCount::getFixed(1);
5324   }
5325 
5326   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5327   if (ConstTripCount &&
5328       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5329       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5330     // If loop trip count (TC) is known at compile time there is no point in
5331     // choosing VF greater than TC (as done in the loop below). Select maximum
5332     // power of two which doesn't exceed TC.
5333     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5334     // when the TC is less than or equal to the known number of lanes.
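    // For example (illustrative): a constant trip count of 12 with a
    // sufficiently wide register clamps the VF to PowerOf2Floor(12) = 8.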
5335     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5336     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5337                          "exceeding the constant trip count: "
5338                       << ClampedConstTripCount << "\n");
5339     return ElementCount::getFixed(ClampedConstTripCount);
5340   }
5341 
5342   ElementCount MaxVF = MaxVectorElementCount;
5343   if (TTI.shouldMaximizeVectorBandwidth() ||
5344       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5345     auto MaxVectorElementCountMaxBW = ElementCount::get(
5346         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5347         ComputeScalableMaxVF);
5348     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5349 
5350     // Collect all viable vectorization factors larger than the default MaxVF
5351     // (i.e. MaxVectorElementCount).
5352     SmallVector<ElementCount, 8> VFs;
5353     for (ElementCount VS = MaxVectorElementCount * 2;
5354          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5355       VFs.push_back(VS);
5356 
5357     // For each VF calculate its register usage.
5358     auto RUs = calculateRegisterUsage(VFs);
5359 
5360     // Select the largest VF which doesn't require more registers than existing
5361     // ones.
5362     for (int i = RUs.size() - 1; i >= 0; --i) {
5363       bool Selected = true;
5364       for (auto &pair : RUs[i].MaxLocalUsers) {
5365         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5366         if (pair.second > TargetNumRegisters)
5367           Selected = false;
5368       }
5369       if (Selected) {
5370         MaxVF = VFs[i];
5371         break;
5372       }
5373     }
5374     if (ElementCount MinVF =
5375             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5376       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5377         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5378                           << ") with target's minimum: " << MinVF << '\n');
5379         MaxVF = MinVF;
5380       }
5381     }
5382   }
5383   return MaxVF;
5384 }
5385 
5386 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5387   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5388     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5389     auto Min = Attr.getVScaleRangeMin();
5390     auto Max = Attr.getVScaleRangeMax();
5391     if (Max && Min == Max)
5392       return Max;
5393   }
5394 
5395   return TTI.getVScaleForTuning();
5396 }
5397 
5398 bool LoopVectorizationCostModel::isMoreProfitable(
5399     const VectorizationFactor &A, const VectorizationFactor &B) const {
5400   InstructionCost CostA = A.Cost;
5401   InstructionCost CostB = B.Cost;
5402 
5403   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5404 
5405   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5406       MaxTripCount) {
5407     // If we are folding the tail and the trip count is a known (possibly small)
5408     // constant, the trip count will be rounded up to an integer number of
5409     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5410     // which we compare directly. When not folding the tail, the total cost will
5411     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
    // approximated with the per-lane cost below instead of using the trip
    // count as here.
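    // For example (illustrative costs): with MaxTripCount = 10, a VF=4 factor
    // of cost 8 runs ceil(10/4) = 3 iterations for a total of 24, while a VF=2
    // factor of cost 6 runs ceil(10/2) = 5 iterations for a total of 30, so
    // the VF=4 factor wins despite its higher per-iteration cost.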
5414     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5415     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5416     return RTCostA < RTCostB;
5417   }
5418 
5419   // Improve estimate for the vector width if it is scalable.
5420   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5421   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5422   if (Optional<unsigned> VScale = getVScaleForTuning()) {
5423     if (A.Width.isScalable())
5424       EstimatedWidthA *= VScale.getValue();
5425     if (B.Width.isScalable())
5426       EstimatedWidthB *= VScale.getValue();
5427   }
5428 
5429   // Assume vscale may be larger than 1 (or the value being tuned for),
5430   // so that scalable vectorization is slightly favorable over fixed-width
5431   // vectorization.
5432   if (A.Width.isScalable() && !B.Width.isScalable())
5433     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5434 
5435   // To avoid the need for FP division:
5436   //      (CostA / A.Width) < (CostB / B.Width)
5437   // <=>  (CostA * B.Width) < (CostB * A.Width)
5438   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5439 }
5440 
5441 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5442     const ElementCountSet &VFCandidates) {
5443   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5444   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5445   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5446   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5447          "Expected Scalar VF to be a candidate");
5448 
5449   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
5450   VectorizationFactor ChosenFactor = ScalarCost;
5451 
5452   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5453   if (ForceVectorization && VFCandidates.size() > 1) {
5454     // Ignore scalar width, because the user explicitly wants vectorization.
5455     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5456     // evaluation.
5457     ChosenFactor.Cost = InstructionCost::getMax();
5458   }
5459 
5460   SmallVector<InstructionVFPair> InvalidCosts;
5461   for (const auto &i : VFCandidates) {
5462     // The cost for scalar VF=1 is already calculated, so ignore it.
5463     if (i.isScalar())
5464       continue;
5465 
5466     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5467     VectorizationFactor Candidate(i, C.first);
5468 
5469 #ifndef NDEBUG
5470     unsigned AssumedMinimumVscale = 1;
5471     if (Optional<unsigned> VScale = getVScaleForTuning())
5472       AssumedMinimumVscale = VScale.getValue();
5473     unsigned Width =
5474         Candidate.Width.isScalable()
5475             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5476             : Candidate.Width.getFixedValue();
5477     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5478                       << " costs: " << (Candidate.Cost / Width));
5479     if (i.isScalable())
5480       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5481                         << AssumedMinimumVscale << ")");
5482     LLVM_DEBUG(dbgs() << ".\n");
5483 #endif
5484 
5485     if (!C.second && !ForceVectorization) {
5486       LLVM_DEBUG(
5487           dbgs() << "LV: Not considering vector loop of width " << i
5488                  << " because it will not generate any vector instructions.\n");
5489       continue;
5490     }
5491 
5492     // If profitable add it to ProfitableVF list.
5493     if (isMoreProfitable(Candidate, ScalarCost))
5494       ProfitableVFs.push_back(Candidate);
5495 
5496     if (isMoreProfitable(Candidate, ChosenFactor))
5497       ChosenFactor = Candidate;
5498   }
5499 
5500   // Emit a report of VFs with invalid costs in the loop.
5501   if (!InvalidCosts.empty()) {
5502     // Group the remarks per instruction, keeping the instruction order from
5503     // InvalidCosts.
5504     std::map<Instruction *, unsigned> Numbering;
5505     unsigned I = 0;
5506     for (auto &Pair : InvalidCosts)
5507       if (!Numbering.count(Pair.first))
5508         Numbering[Pair.first] = I++;
5509 
5510     // Sort the list, first on instruction(number) then on VF.
5511     llvm::sort(InvalidCosts,
5512                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5513                  if (Numbering[A.first] != Numbering[B.first])
5514                    return Numbering[A.first] < Numbering[B.first];
5515                  ElementCountComparator ECC;
5516                  return ECC(A.second, B.second);
5517                });
5518 
5519     // For a list of ordered instruction-vf pairs:
5520     //   [(load, vf1), (load, vf2), (store, vf1)]
5521     // Group the instructions together to emit separate remarks for:
5522     //   load  (vf1, vf2)
5523     //   store (vf1)
5524     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5525     auto Subset = ArrayRef<InstructionVFPair>();
5526     do {
5527       if (Subset.empty())
5528         Subset = Tail.take_front(1);
5529 
5530       Instruction *I = Subset.front().first;
5531 
5532       // If the next instruction is different, or if there are no other pairs,
5533       // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2)]
      // to emit:
      //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5537       if (Subset == Tail || Tail[Subset.size()].first != I) {
5538         std::string OutString;
5539         raw_string_ostream OS(OutString);
5540         assert(!Subset.empty() && "Unexpected empty range");
5541         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5542         for (auto &Pair : Subset)
5543           OS << (Pair.second == Subset.front().second ? "" : ", ")
5544              << Pair.second;
5545         OS << "):";
5546         if (auto *CI = dyn_cast<CallInst>(I))
5547           OS << " call to " << CI->getCalledFunction()->getName();
5548         else
5549           OS << " " << I->getOpcodeName();
5550         OS.flush();
5551         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5552         Tail = Tail.drop_front(Subset.size());
5553         Subset = {};
5554       } else
5555         // Grow the subset by one element
5556         Subset = Tail.take_front(Subset.size() + 1);
5557     } while (!Tail.empty());
5558   }
5559 
5560   if (!EnableCondStoresVectorization && NumPredStores) {
5561     reportVectorizationFailure("There are conditional stores.",
5562         "store that is conditionally executed prevents vectorization",
5563         "ConditionalStore", ORE, TheLoop);
5564     ChosenFactor = ScalarCost;
5565   }
5566 
5567   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5568                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5569              << "LV: Vectorization seems to be not beneficial, "
5570              << "but was forced by a user.\n");
5571   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5572   return ChosenFactor;
5573 }
5574 
5575 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5576     const Loop &L, ElementCount VF) const {
5577   // Cross iteration phis such as reductions need special handling and are
5578   // currently unsupported.
5579   if (any_of(L.getHeader()->phis(),
5580              [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
5581     return false;
5582 
5583   // Phis with uses outside of the loop require special handling and are
5584   // currently unsupported.
5585   for (auto &Entry : Legal->getInductionVars()) {
5586     // Look for uses of the value of the induction at the last iteration.
5587     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5588     for (User *U : PostInc->users())
5589       if (!L.contains(cast<Instruction>(U)))
5590         return false;
5591     // Look for uses of penultimate value of the induction.
5592     for (User *U : Entry.first->users())
5593       if (!L.contains(cast<Instruction>(U)))
5594         return false;
5595   }
5596 
5597   // Induction variables that are widened require special handling that is
5598   // currently not supported.
5599   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5600         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5601                  this->isProfitableToScalarize(Entry.first, VF));
5602       }))
5603     return false;
5604 
  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
5608   if (L.getExitingBlock() != L.getLoopLatch())
5609     return false;
5610 
5611   return true;
5612 }
5613 
5614 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5615     const ElementCount VF) const {
5616   // FIXME: We need a much better cost-model to take different parameters such
5617   // as register pressure, code size increase and cost of extra branches into
5618   // account. For now we apply a very crude heuristic and only consider loops
5619   // with vectorization factors larger than a certain value.
5620   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
5622   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5623     return false;
5624   // FIXME: We should consider changing the threshold for scalable
5625   // vectors to take VScaleForTuning into account.
5626   if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5627     return true;
5628   return false;
5629 }
5630 
5631 VectorizationFactor
5632 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5633     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5634   VectorizationFactor Result = VectorizationFactor::Disabled();
5635   if (!EnableEpilogueVectorization) {
5636     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5637     return Result;
5638   }
5639 
5640   if (!isScalarEpilogueAllowed()) {
5641     LLVM_DEBUG(
5642         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5643                   "allowed.\n";);
5644     return Result;
5645   }
5646 
5647   // Not really a cost consideration, but check for unsupported cases here to
5648   // simplify the logic.
5649   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5650     LLVM_DEBUG(
5651         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5652                   "not a supported candidate.\n";);
5653     return Result;
5654   }
5655 
5656   if (EpilogueVectorizationForceVF > 1) {
5657     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5658     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5659     if (LVP.hasPlanWithVF(ForcedEC))
5660       return {ForcedEC, 0};
5661     else {
5662       LLVM_DEBUG(
5663           dbgs()
5664               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5665       return Result;
5666     }
5667   }
5668 
5669   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5670       TheLoop->getHeader()->getParent()->hasMinSize()) {
5671     LLVM_DEBUG(
5672         dbgs()
5673             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5674     return Result;
5675   }
5676 
5677   if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5678     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5679                          "this loop\n");
5680     return Result;
5681   }
5682 
5683   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5684   // the main loop handles 8 lanes per iteration. We could still benefit from
5685   // vectorizing the epilogue loop with VF=4.
5686   ElementCount EstimatedRuntimeVF = MainLoopVF;
5687   if (MainLoopVF.isScalable()) {
5688     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5689     if (Optional<unsigned> VScale = getVScaleForTuning())
5690       EstimatedRuntimeVF *= VScale.getValue();
5691   }
5692 
5693   for (auto &NextVF : ProfitableVFs)
5694     if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5695           ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5696          ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5697         (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5698         LVP.hasPlanWithVF(NextVF.Width))
5699       Result = NextVF;
5700 
5701   if (Result != VectorizationFactor::Disabled())
5702     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5703                       << Result.Width << "\n";);
5704   return Result;
5705 }
5706 
5707 std::pair<unsigned, unsigned>
5708 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5709   unsigned MinWidth = -1U;
5710   unsigned MaxWidth = 8;
5711   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5712   // For in-loop reductions, no element types are added to ElementTypesInLoop
5713   // if there are no loads/stores in the loop. In this case, check through the
5714   // reduction variables to determine the maximum width.
5715   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5716     // Reset MaxWidth so that we can find the smallest type used by recurrences
5717     // in the loop.
5718     MaxWidth = -1U;
5719     for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
5720       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5721       // When finding the min width used by the recurrence we need to account
5722       // for casts on the input operands of the recurrence.
5723       MaxWidth = std::min<unsigned>(
5724           MaxWidth, std::min<unsigned>(
5725                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5726                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5727     }
5728   } else {
5729     for (Type *T : ElementTypesInLoop) {
5730       MinWidth = std::min<unsigned>(
5731           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5732       MaxWidth = std::max<unsigned>(
5733           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5734     }
5735   }
5736   return {MinWidth, MaxWidth};
5737 }
5738 
5739 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5740   ElementTypesInLoop.clear();
5741   // For each block.
5742   for (BasicBlock *BB : TheLoop->blocks()) {
5743     // For each instruction in the loop.
5744     for (Instruction &I : BB->instructionsWithoutDebug()) {
5745       Type *T = I.getType();
5746 
5747       // Skip ignored values.
5748       if (ValuesToIgnore.count(&I))
5749         continue;
5750 
5751       // Only examine Loads, Stores and PHINodes.
5752       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5753         continue;
5754 
5755       // Examine PHI nodes that are reduction variables. Update the type to
5756       // account for the recurrence type.
5757       if (auto *PN = dyn_cast<PHINode>(&I)) {
5758         if (!Legal->isReductionVariable(PN))
5759           continue;
5760         const RecurrenceDescriptor &RdxDesc =
5761             Legal->getReductionVars().find(PN)->second;
5762         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5763             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5764                                       RdxDesc.getRecurrenceType(),
5765                                       TargetTransformInfo::ReductionFlags()))
5766           continue;
5767         T = RdxDesc.getRecurrenceType();
5768       }
5769 
5770       // Examine the stored values.
5771       if (auto *ST = dyn_cast<StoreInst>(&I))
5772         T = ST->getValueOperand()->getType();
5773 
5774       assert(T->isSized() &&
5775              "Expected the load/store/recurrence type to be sized");
5776 
5777       ElementTypesInLoop.insert(T);
5778     }
5779   }
5780 }
5781 
5782 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5783                                                            unsigned LoopCost) {
5784   // -- The interleave heuristics --
5785   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5786   // There are many micro-architectural considerations that we can't predict
5787   // at this level. For example, frontend pressure (on decode or fetch) due to
5788   // code size, or the number and capabilities of the execution ports.
5789   //
5790   // We use the following heuristics to select the interleave count:
5791   // 1. If the code has reductions, then we interleave to break the cross
5792   // iteration dependency.
5793   // 2. If the loop is really small, then we interleave to reduce the loop
5794   // overhead.
5795   // 3. We don't interleave if we think that we will spill registers to memory
5796   // due to the increased register pressure.
5797 
5798   if (!isScalarEpilogueAllowed())
5799     return 1;
5800 
  // We already used the maximum safe dependence distance when choosing the VF,
  // so do not interleave further.
5802   if (Legal->getMaxSafeDepDistBytes() != -1U)
5803     return 1;
5804 
5805   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5806   const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
  // because with the above conditions interleaving can expose ILP and break
  // cross-iteration dependences for reductions.
5812   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5813       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5814     return 1;
5815 
5816   RegisterUsage R = calculateRegisterUsage({VF})[0];
5817   // We divide by these constants so assume that we have at least one
5818   // instruction that uses at least one register.
5819   for (auto& pair : R.MaxLocalUsers) {
5820     pair.second = std::max(pair.second, 1U);
5821   }
5822 
5823   // We calculate the interleave count using the following formula.
5824   // Subtract the number of loop invariants from the number of available
5825   // registers. These registers are used by all of the interleaved instances.
5826   // Next, divide the remaining registers by the number of registers that is
5827   // required by the loop, in order to estimate how many parallel instances
5828   // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power-of-two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently happens when optimizing for size, in which case IC
  // is set to 1 above.
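  // For example (illustrative values): with 32 registers in a class, 4
  // loop-invariant values and a maximum of 7 values live at once, the estimate
  // is PowerOf2Floor((32 - 4) / 7) = PowerOf2Floor(4) = 4 interleaved
  // instances.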
5834   unsigned IC = UINT_MAX;
5835 
5836   for (auto& pair : R.MaxLocalUsers) {
5837     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5838     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5839                       << " registers of "
5840                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5841     if (VF.isScalar()) {
5842       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5843         TargetNumRegisters = ForceTargetNumScalarRegs;
5844     } else {
5845       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5846         TargetNumRegisters = ForceTargetNumVectorRegs;
5847     }
5848     unsigned MaxLocalUsers = pair.second;
5849     unsigned LoopInvariantRegs = 0;
5850     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5851       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5852 
5853     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5854     // Don't count the induction variable as interleaved.
5855     if (EnableIndVarRegisterHeur) {
5856       TmpIC =
5857           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5858                         std::max(1U, (MaxLocalUsers - 1)));
5859     }
5860 
5861     IC = std::min(IC, TmpIC);
5862   }
5863 
5864   // Clamp the interleave ranges to reasonable counts.
5865   unsigned MaxInterleaveCount =
5866       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5867 
5868   // Check if the user has overridden the max.
5869   if (VF.isScalar()) {
5870     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5871       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5872   } else {
5873     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5874       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5875   }
5876 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count so that it does not exceed the trip count divided by VF,
  // provided it is at least 1.
5880   //
5881   // For scalable vectors we can't know if interleaving is beneficial. It may
5882   // not be beneficial for small loops if none of the lanes in the second vector
  // iteration is enabled. However, for larger loops, there is likely to be a
5884   // similar benefit as for fixed-width vectors. For now, we choose to leave
5885   // the InterleaveCount as if vscale is '1', although if some information about
5886   // the vector is known (e.g. min vector size), we can make a better decision.
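  // For example (illustrative): with an estimated trip count of 40 and VF = 8,
  // the interleave count is capped at 40 / 8 = 5 (or the target maximum,
  // whichever is smaller).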
5887   if (BestKnownTC) {
5888     MaxInterleaveCount =
5889         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5890     // Make sure MaxInterleaveCount is greater than 0.
5891     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5892   }
5893 
5894   assert(MaxInterleaveCount > 0 &&
5895          "Maximum interleave count must be greater than 0");
5896 
5897   // Clamp the calculated IC to be between the 1 and the max interleave count
5898   // that the target and trip count allows.
5899   if (IC > MaxInterleaveCount)
5900     IC = MaxInterleaveCount;
5901   else
5902     // Make sure IC is greater than 0.
5903     IC = std::max(1u, IC);
5904 
5905   assert(IC > 0 && "Interleave count must be greater than 0.");
5906 
5907   // If we did not calculate the cost for VF (because the user selected the VF)
5908   // then we calculate the cost of VF here.
5909   if (LoopCost == 0) {
5910     InstructionCost C = expectedCost(VF).first;
5911     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
5912     LoopCost = *C.getValue();
5913   }
5914 
5915   assert(LoopCost && "Non-zero loop cost expected");
5916 
5917   // Interleave if we vectorized this loop and there is a reduction that could
5918   // benefit from interleaving.
5919   if (VF.isVector() && HasReductions) {
5920     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5921     return IC;
5922   }
5923 
5924   // For any scalar loop that either requires runtime checks or predication we
5925   // are better off leaving this to the unroller. Note that if we've already
5926   // vectorized the loop we will have done the runtime check and so interleaving
5927   // won't require further checks.
5928   bool ScalarInterleavingRequiresPredication =
5929       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5930          return Legal->blockNeedsPredication(BB);
5931        }));
5932   bool ScalarInterleavingRequiresRuntimePointerCheck =
5933       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5934 
5935   // We want to interleave small loops in order to reduce the loop overhead and
5936   // potentially expose ILP opportunities.
5937   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5938                     << "LV: IC is " << IC << '\n'
5939                     << "LV: VF is " << VF << '\n');
5940   const bool AggressivelyInterleaveReductions =
5941       TTI.enableAggressiveInterleaving(HasReductions);
5942   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5943       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
    // We assume that the overhead cost of the loop is 1 and use the cost model
    // to estimate the cost of the loop; we then interleave until the loop
    // overhead is about 5% of the cost of the loop.
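    // For example (illustrative costs): with SmallLoopCost = 20 and
    // LoopCost = 3, the ratio is 6 and PowerOf2Floor(6) = 4, so SmallIC is at
    // most 4.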
5947     unsigned SmallIC =
5948         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5949 
5950     // Interleave until store/load ports (estimated by max interleave count) are
5951     // saturated.
5952     unsigned NumStores = Legal->getNumStores();
5953     unsigned NumLoads = Legal->getNumLoads();
5954     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5955     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5956 
5957     // There is little point in interleaving for reductions containing selects
5958     // and compares when VF=1 since it may just create more overhead than it's
5959     // worth for loops with small trip counts. This is because we still have to
5960     // do the final reduction after the loop.
5961     bool HasSelectCmpReductions =
5962         HasReductions &&
5963         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5964           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5965           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
5966               RdxDesc.getRecurrenceKind());
5967         });
5968     if (HasSelectCmpReductions) {
5969       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5970       return 1;
5971     }
5972 
5973     // If we have a scalar reduction (vector reductions are already dealt with
5974     // by this point), we can increase the critical path length if the loop
5975     // we're interleaving is inside another loop. For tree-wise reductions
5976     // set the limit to 2, and for ordered reductions it's best to disable
5977     // interleaving entirely.
5978     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5979       bool HasOrderedReductions =
5980           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5981             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5982             return RdxDesc.isOrdered();
5983           });
5984       if (HasOrderedReductions) {
5985         LLVM_DEBUG(
5986             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5987         return 1;
5988       }
5989 
5990       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5991       SmallIC = std::min(SmallIC, F);
5992       StoresIC = std::min(StoresIC, F);
5993       LoadsIC = std::min(LoadsIC, F);
5994     }
5995 
5996     if (EnableLoadStoreRuntimeInterleave &&
5997         std::max(StoresIC, LoadsIC) > SmallIC) {
5998       LLVM_DEBUG(
5999           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6000       return std::max(StoresIC, LoadsIC);
6001     }
6002 
6003     // If there are scalar reductions and TTI has enabled aggressive
6004     // interleaving for reductions, we will interleave to expose ILP.
6005     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6006         AggressivelyInterleaveReductions) {
6007       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressively as the normal
      // IC, to accommodate the rare case where resources are too limited.
6010       return std::max(IC / 2, SmallIC);
6011     } else {
6012       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6013       return SmallIC;
6014     }
6015   }
6016 
6017   // Interleave if this is a large loop (small loops are already dealt with by
6018   // this point) that could benefit from interleaving.
6019   if (AggressivelyInterleaveReductions) {
6020     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6021     return IC;
6022   }
6023 
6024   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6025   return 1;
6026 }
6027 
6028 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6029 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
  // This function calculates the register usage by measuring the highest number
  // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are
  // met before their users. We assume that each instruction that has in-loop
  // users starts an interval. We record every time that an in-loop value is
  // used, so we have a list of the first and last occurrences of each
  // instruction. Next, we transpose this data structure into a multi-map that
  // holds the list of intervals that *end* at a specific location. This
  // multi-map allows us to perform a linear search. We scan the instructions
  // linearly and record each time that a new interval starts, by placing it in
  // a set. If we find this value in the multi-map then we remove it from the
  // set. The max register usage is the maximum size of the set.
  // We also search for instructions that are defined outside the loop, but are
  // used inside the loop. We need this number separately from the max-interval
  // usage number because, when we unroll, loop-invariant values do not take
  // more registers.
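  // A small worked example (hypothetical IR, for illustration only): in the
  // chain
  //   %a = load i32, i32* %p
  //   %b = add i32 %a, 1
  //   %c = mul i32 %a, %b
  //   store i32 %c, i32* %q
  // both %a and %b are still live when %c is defined, so two intervals are
  // open at that point and the usage estimate for that register class is 2.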
6047   LoopBlocksDFS DFS(TheLoop);
6048   DFS.perform(LI);
6049 
6050   RegisterUsage RU;
6051 
6052   // Each 'key' in the map opens a new interval. The values
6053   // of the map are the index of the 'last seen' usage of the
6054   // instruction that is the key.
6055   using IntervalMap = DenseMap<Instruction *, unsigned>;
6056 
6057   // Maps instruction to its index.
6058   SmallVector<Instruction *, 64> IdxToInstr;
6059   // Marks the end of each interval.
6060   IntervalMap EndPoint;
  // Saves the set of instructions that are used (as operands) inside the loop.
6062   SmallPtrSet<Instruction *, 8> Ends;
6063   // Saves the list of values that are used in the loop but are
6064   // defined outside the loop, such as arguments and constants.
6065   SmallPtrSet<Value *, 8> LoopInvariants;
6066 
6067   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6068     for (Instruction &I : BB->instructionsWithoutDebug()) {
6069       IdxToInstr.push_back(&I);
6070 
6071       // Save the end location of each USE.
6072       for (Value *U : I.operands()) {
6073         auto *Instr = dyn_cast<Instruction>(U);
6074 
6075         // Ignore non-instruction values such as arguments, constants, etc.
6076         if (!Instr)
6077           continue;
6078 
6079         // If this instruction is outside the loop then record it and continue.
6080         if (!TheLoop->contains(Instr)) {
6081           LoopInvariants.insert(Instr);
6082           continue;
6083         }
6084 
6085         // Overwrite previous end points.
6086         EndPoint[Instr] = IdxToInstr.size();
6087         Ends.insert(Instr);
6088       }
6089     }
6090   }
6091 
6092   // Saves the list of intervals that end with the index in 'key'.
6093   using InstrList = SmallVector<Instruction *, 2>;
6094   DenseMap<unsigned, InstrList> TransposeEnds;
6095 
6096   // Transpose the EndPoints to a list of values that end at each index.
6097   for (auto &Interval : EndPoint)
6098     TransposeEnds[Interval.second].push_back(Interval.first);
6099 
6100   SmallPtrSet<Instruction *, 8> OpenIntervals;
6101   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6102   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6103 
6104   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6105 
6106   // A lambda that gets the register usage for the given type and VF.
6107   const auto &TTICapture = TTI;
6108   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6109     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6110       return 0;
6111     InstructionCost::CostType RegUsage =
6112         *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6113     assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
6114            "Nonsensical values for register usage.");
6115     return RegUsage;
6116   };
6117 
6118   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6119     Instruction *I = IdxToInstr[i];
6120 
6121     // Remove all of the instructions that end at this location.
6122     InstrList &List = TransposeEnds[i];
6123     for (Instruction *ToRemove : List)
6124       OpenIntervals.erase(ToRemove);
6125 
6126     // Ignore instructions that are never used within the loop.
6127     if (!Ends.count(I))
6128       continue;
6129 
6130     // Skip ignored values.
6131     if (ValuesToIgnore.count(I))
6132       continue;
6133 
6134     // For each VF find the maximum usage of registers.
6135     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6136       // Count the number of live intervals.
6137       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6138 
6139       if (VFs[j].isScalar()) {
        for (auto *Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          RegUsage[ClassID] += 1;
        }
6147       } else {
6148         collectUniformsAndScalars(VFs[j]);
        for (auto *Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
6167       }
6168 
      for (auto &Pair : RegUsage)
        MaxUsages[j][Pair.first] =
            std::max(MaxUsages[j][Pair.first], Pair.second);
6175     }
6176 
6177     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6178                       << OpenIntervals.size() << '\n');
6179 
6180     // Add the current instruction to the list of open intervals.
6181     OpenIntervals.insert(I);
6182   }
6183 
6184   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6185     SmallMapVector<unsigned, unsigned, 4> Invariant;
6186 
    for (auto *Inst : LoopInvariants) {
      unsigned Usage =
          VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
      unsigned ClassID =
          TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      Invariant[ClassID] += Usage;
6196     }
6197 
6198     LLVM_DEBUG({
6199       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6200       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6201              << " item\n";
6202       for (const auto &pair : MaxUsages[i]) {
6203         dbgs() << "LV(REG): RegisterClass: "
6204                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6205                << " registers\n";
6206       }
6207       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6208              << " item\n";
6209       for (const auto &pair : Invariant) {
6210         dbgs() << "LV(REG): RegisterClass: "
6211                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6212                << " registers\n";
6213       }
6214     });
6215 
6216     RU.LoopInvariantRegs = Invariant;
6217     RU.MaxLocalUsers = MaxUsages[i];
6218     RUs[i] = RU;
6219   }
6220 
6221   return RUs;
6222 }
6223 
6224 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6225                                                            ElementCount VF) {
  // TODO: The cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // A limited amount of Masked Store/Scatter emulation was allowed.
6234   assert(isPredicatedInst(I, VF) && "Expecting a scalar emulated instruction");
6235   return isa<LoadInst>(I) ||
6236          (isa<StoreInst>(I) &&
6237           NumPredStores > NumberOfStoresToPredicate);
6238 }
6239 
6240 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6241   // If we aren't vectorizing the loop, or if we've already collected the
6242   // instructions to scalarize, there's nothing to do. Collection may already
6243   // have occurred if we have a user-selected VF and are now computing the
6244   // expected cost for interleaving.
6245   if (VF.isScalar() || VF.isZero() ||
6246       InstsToScalarize.find(VF) != InstsToScalarize.end())
6247     return;
6248 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6250   // not profitable to scalarize any instructions, the presence of VF in the
6251   // map will indicate that we've analyzed it already.
6252   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6253 
6254   // Find all the instructions that are scalar with predication in the loop and
6255   // determine if it would be better to not if-convert the blocks they are in.
6256   // If so, we also record the instructions to scalarize.
6257   for (BasicBlock *BB : TheLoop->blocks()) {
6258     if (!blockNeedsPredicationForAnyReason(BB))
6259       continue;
6260     for (Instruction &I : *BB)
6261       if (isScalarWithPredication(&I, VF)) {
6262         ScalarCostsTy ScalarCosts;
        // Do not apply the discount if the VF is scalable, because that would
        // lead to invalid scalarization costs.
        // Also skip the discount logic if the hacked cost is needed for
        // emulated masked memrefs.
6267         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6268             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6269           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6270         // Remember that BB will remain after vectorization.
6271         PredicatedBBsAfterVectorization.insert(BB);
6272       }
6273   }
6274 }
6275 
6276 int LoopVectorizationCostModel::computePredInstDiscount(
6277     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6278   assert(!isUniformAfterVectorization(PredInst, VF) &&
6279          "Instruction marked uniform-after-vectorization will be predicated");
6280 
6281   // Initialize the discount to zero, meaning that the scalar version and the
6282   // vector version cost the same.
6283   InstructionCost Discount = 0;
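  // For intuition (hypothetical costs): if the vector form of the chain costs
  // 10 and the scalarized form, after scaling by the block probability, costs
  // 7, the accumulated Discount is 3, i.e. scalarizing is expected to save 3
  // units of cost.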
6284 
6285   // Holds instructions to analyze. The instructions we visit are mapped in
6286   // ScalarCosts. Those instructions are the ones that would be scalarized if
6287   // we find that the scalar version costs less.
6288   SmallVector<Instruction *, 8> Worklist;
6289 
6290   // Returns true if the given instruction can be scalarized.
6291   auto canBeScalarized = [&](Instruction *I) -> bool {
6292     // We only attempt to scalarize instructions forming a single-use chain
6293     // from the original predicated block that would otherwise be vectorized.
6294     // Although not strictly necessary, we give up on instructions we know will
6295     // already be scalar to avoid traversing chains that are unlikely to be
6296     // beneficial.
6297     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6298         isScalarAfterVectorization(I, VF))
6299       return false;
6300 
6301     // If the instruction is scalar with predication, it will be analyzed
6302     // separately. We ignore it within the context of PredInst.
6303     if (isScalarWithPredication(I, VF))
6304       return false;
6305 
6306     // If any of the instruction's operands are uniform after vectorization,
6307     // the instruction cannot be scalarized. This prevents, for example, a
6308     // masked load from being scalarized.
6309     //
6310     // We assume we will only emit a value for lane zero of an instruction
6311     // marked uniform after vectorization, rather than VF identical values.
6312     // Thus, if we scalarize an instruction that uses a uniform, we would
6313     // create uses of values corresponding to the lanes we aren't emitting code
6314     // for. This behavior can be changed by allowing getScalarValue to clone
6315     // the lane zero values for uniforms rather than asserting.
6316     for (Use &U : I->operands())
6317       if (auto *J = dyn_cast<Instruction>(U.get()))
6318         if (isUniformAfterVectorization(J, VF))
6319           return false;
6320 
6321     // Otherwise, we can scalarize the instruction.
6322     return true;
6323   };
6324 
6325   // Compute the expected cost discount from scalarizing the entire expression
6326   // feeding the predicated instruction. We currently only consider expressions
6327   // that are single-use instruction chains.
6328   Worklist.push_back(PredInst);
6329   while (!Worklist.empty()) {
6330     Instruction *I = Worklist.pop_back_val();
6331 
6332     // If we've already analyzed the instruction, there's nothing to do.
6333     if (ScalarCosts.find(I) != ScalarCosts.end())
6334       continue;
6335 
6336     // Compute the cost of the vector instruction. Note that this cost already
6337     // includes the scalarization overhead of the predicated instruction.
6338     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6339 
6340     // Compute the cost of the scalarized instruction. This cost is the cost of
6341     // the instruction as if it wasn't if-converted and instead remained in the
6342     // predicated block. We will scale this cost by block probability after
6343     // computing the scalarization overhead.
6344     InstructionCost ScalarCost =
6345         VF.getFixedValue() *
6346         getInstructionCost(I, ElementCount::getFixed(1)).first;
6347 
6348     // Compute the scalarization overhead of needed insertelement instructions
6349     // and phi nodes.
6350     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6351       ScalarCost += TTI.getScalarizationOverhead(
6352           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6353           APInt::getAllOnes(VF.getFixedValue()), true, false);
6354       ScalarCost +=
6355           VF.getFixedValue() *
6356           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6357     }
6358 
6359     // Compute the scalarization overhead of needed extractelement
6360     // instructions. For each of the instruction's operands, if the operand can
6361     // be scalarized, add it to the worklist; otherwise, account for the
6362     // overhead.
6363     for (Use &U : I->operands())
6364       if (auto *J = dyn_cast<Instruction>(U.get())) {
6365         assert(VectorType::isValidElementType(J->getType()) &&
6366                "Instruction has non-scalar type");
6367         if (canBeScalarized(J))
6368           Worklist.push_back(J);
6369         else if (needsExtract(J, VF)) {
6370           ScalarCost += TTI.getScalarizationOverhead(
6371               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6372               APInt::getAllOnes(VF.getFixedValue()), false, true);
6373         }
6374       }
6375 
6376     // Scale the total scalar cost by block probability.
6377     ScalarCost /= getReciprocalPredBlockProb();
6378 
6379     // Compute the discount. A non-negative discount means the vector version
6380     // of the instruction costs more, and scalarizing would be beneficial.
6381     Discount += VectorCost - ScalarCost;
6382     ScalarCosts[I] = ScalarCost;
6383   }
6384 
6385   return *Discount.getValue();
6386 }
6387 
6388 LoopVectorizationCostModel::VectorizationCostTy
6389 LoopVectorizationCostModel::expectedCost(
6390     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6391   VectorizationCostTy Cost;
6392 
6393   // For each block.
6394   for (BasicBlock *BB : TheLoop->blocks()) {
6395     VectorizationCostTy BlockCost;
6396 
6397     // For each instruction in the old loop.
6398     for (Instruction &I : BB->instructionsWithoutDebug()) {
6399       // Skip ignored values.
6400       if (ValuesToIgnore.count(&I) ||
6401           (VF.isVector() && VecValuesToIgnore.count(&I)))
6402         continue;
6403 
6404       VectorizationCostTy C = getInstructionCost(&I, VF);
6405 
6406       // Check if we should override the cost.
6407       if (C.first.isValid() &&
6408           ForceTargetInstructionCost.getNumOccurrences() > 0)
6409         C.first = InstructionCost(ForceTargetInstructionCost);
6410 
6411       // Keep a list of instructions with invalid costs.
6412       if (Invalid && !C.first.isValid())
6413         Invalid->emplace_back(&I, VF);
6414 
6415       BlockCost.first += C.first;
6416       BlockCost.second |= C.second;
6417       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6418                         << " for VF " << VF << " For instruction: " << I
6419                         << '\n');
6420     }
6421 
6422     // If we are vectorizing a predicated block, it will have been
6423     // if-converted. This means that the block's instructions (aside from
6424     // stores and instructions that may divide by zero) will now be
6425     // unconditionally executed. For the scalar case, we may not always execute
6426     // the predicated block, if it is an if-else block. Thus, scale the block's
6427     // cost by the probability of executing it. blockNeedsPredication from
6428     // Legal is used so as to not include all blocks in tail folded loops.
6429     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6430       BlockCost.first /= getReciprocalPredBlockProb();
6431 
6432     Cost.first += BlockCost.first;
6433     Cost.second |= BlockCost.second;
6434   }
6435 
6436   return Cost;
6437 }
6438 
/// Gets the address access SCEV after verifying that the access pattern is
/// loop invariant except for the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost.
static const SCEV *getAddressAccessSCEV(Value *Ptr,
                                        LoopVectorizationLegality *Legal,
                                        PredicatedScalarEvolution &PSE,
                                        const Loop *TheLoop) {
6449 
6450   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6451   if (!Gep)
6452     return nullptr;
6453 
6454   // We are looking for a gep with all loop invariant indices except for one
6455   // which should be an induction variable.
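  // For example (illustrative IR): a pointer such as
  //   %gep = getelementptr inbounds i32, i32* %base, i64 %iv
  // qualifies when %base is loop invariant and %iv is an induction variable,
  // whereas a GEP with another loop-varying, non-induction index does not.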
6456   auto SE = PSE.getSE();
6457   unsigned NumOperands = Gep->getNumOperands();
6458   for (unsigned i = 1; i < NumOperands; ++i) {
6459     Value *Opd = Gep->getOperand(i);
6460     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6461         !Legal->isInductionVariable(Opd))
6462       return nullptr;
6463   }
6464 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6466   return PSE.getSCEV(Ptr);
6467 }
6468 
6469 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6470   return Legal->hasStride(I->getOperand(0)) ||
6471          Legal->hasStride(I->getOperand(1));
6472 }
6473 
6474 InstructionCost
6475 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6476                                                         ElementCount VF) {
6477   assert(VF.isVector() &&
6478          "Scalarization cost of instruction implies vectorization.");
6479   if (VF.isScalable())
6480     return InstructionCost::getInvalid();
6481 
6482   Type *ValTy = getLoadStoreType(I);
6483   auto SE = PSE.getSE();
6484 
6485   unsigned AS = getLoadStoreAddressSpace(I);
6486   Value *Ptr = getLoadStorePointerOperand(I);
6487   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6488   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6489   //       that it is being called from this specific place.
6490 
  // Figure out whether the access is strided and get the stride value, if it
  // is known at compile time.
6493   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6494 
6495   // Get the cost of the scalar memory instruction and address computation.
6496   InstructionCost Cost =
6497       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6498 
6499   // Don't pass *I here, since it is scalar but will actually be part of a
6500   // vectorized loop where the user of it is a vectorized instruction.
6501   const Align Alignment = getLoadStoreAlignment(I);
6502   Cost += VF.getKnownMinValue() *
6503           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6504                               AS, TTI::TCK_RecipThroughput);
6505 
6506   // Get the overhead of the extractelement and insertelement instructions
6507   // we might create due to scalarization.
6508   Cost += getScalarizationOverhead(I, VF);
6509 
6510   // If we have a predicated load/store, it will need extra i1 extracts and
6511   // conditional branches, but may not be executed for each vector lane. Scale
6512   // the cost by the probability of executing the predicated block.
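  // E.g. (hypothetical): under the usual 50% block-probability assumption a
  // scalarized cost of 8 is scaled down to 4, before the i1 extract and
  // branch overhead below is added on top.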
6513   if (isPredicatedInst(I, VF)) {
6514     Cost /= getReciprocalPredBlockProb();
6515 
6516     // Add the cost of an i1 extract and a branch
6517     auto *Vec_i1Ty =
6518         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6519     Cost += TTI.getScalarizationOverhead(
6520         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6521         /*Insert=*/false, /*Extract=*/true);
6522     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6523 
6524     if (useEmulatedMaskMemRefHack(I, VF))
6525       // Artificially setting to a high enough value to practically disable
6526       // vectorization with such operations.
6527       Cost = 3000000;
6528   }
6529 
6530   return Cost;
6531 }
6532 
6533 InstructionCost
6534 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6535                                                     ElementCount VF) {
6536   Type *ValTy = getLoadStoreType(I);
6537   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6538   Value *Ptr = getLoadStorePointerOperand(I);
6539   unsigned AS = getLoadStoreAddressSpace(I);
6540   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6541   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6542 
6543   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6544          "Stride should be 1 or -1 for consecutive memory access");
6545   const Align Alignment = getLoadStoreAlignment(I);
6546   InstructionCost Cost = 0;
6547   if (Legal->isMaskRequired(I))
6548     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6549                                       CostKind);
6550   else
6551     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6552                                 CostKind, I);
6553 
6554   bool Reverse = ConsecutiveStride < 0;
6555   if (Reverse)
6556     Cost +=
6557         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6558   return Cost;
6559 }
6560 
6561 InstructionCost
6562 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6563                                                 ElementCount VF) {
6564   assert(Legal->isUniformMemOp(*I));
6565 
6566   Type *ValTy = getLoadStoreType(I);
6567   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6568   const Align Alignment = getLoadStoreAlignment(I);
6569   unsigned AS = getLoadStoreAddressSpace(I);
6570   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6571   if (isa<LoadInst>(I)) {
6572     return TTI.getAddressComputationCost(ValTy) +
6573            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6574                                CostKind) +
6575            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6576   }
6577   StoreInst *SI = cast<StoreInst>(I);
6578 
6579   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6580   return TTI.getAddressComputationCost(ValTy) +
6581          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6582                              CostKind) +
6583          (isLoopInvariantStoreValue
6584               ? 0
6585               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6586                                        VF.getKnownMinValue() - 1));
6587 }
6588 
6589 InstructionCost
6590 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6591                                                  ElementCount VF) {
6592   Type *ValTy = getLoadStoreType(I);
6593   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6594   const Align Alignment = getLoadStoreAlignment(I);
6595   const Value *Ptr = getLoadStorePointerOperand(I);
6596 
6597   return TTI.getAddressComputationCost(VectorTy) +
6598          TTI.getGatherScatterOpCost(
6599              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6600              TargetTransformInfo::TCK_RecipThroughput, I);
6601 }
6602 
6603 InstructionCost
6604 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6605                                                    ElementCount VF) {
6606   // TODO: Once we have support for interleaving with scalable vectors
6607   // we can calculate the cost properly here.
6608   if (VF.isScalable())
6609     return InstructionCost::getInvalid();
6610 
6611   Type *ValTy = getLoadStoreType(I);
6612   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6613   unsigned AS = getLoadStoreAddressSpace(I);
6614 
6615   auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Failed to get an interleaved access group.");
6617 
6618   unsigned InterleaveFactor = Group->getFactor();
6619   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6620 
6621   // Holds the indices of existing members in the interleaved group.
6622   SmallVector<unsigned, 4> Indices;
6623   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6624     if (Group->getMember(IF))
6625       Indices.push_back(IF);
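  // For instance (hypothetical group): with an interleave factor of 3 where
  // only members 0 and 2 exist, Indices is {0, 2} and the missing member
  // leaves a gap in the wide load or store.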
6626 
6627   // Calculate the cost of the whole interleaved group.
6628   bool UseMaskForGaps =
6629       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6630       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6631   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6632       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6633       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6634 
6635   if (Group->isReverse()) {
6636     // TODO: Add support for reversed masked interleaved access.
6637     assert(!Legal->isMaskRequired(I) &&
6638            "Reverse masked interleaved access not supported.");
6639     Cost +=
6640         Group->getNumMembers() *
6641         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6642   }
6643   return Cost;
6644 }
6645 
6646 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6647     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6648   using namespace llvm::PatternMatch;
  // Early exit for no in-loop reductions.
6650   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6651     return None;
6652   auto *VectorTy = cast<VectorType>(Ty);
6653 
  // We are looking for one of the following patterns, and for the minimal
  // acceptable cost of it:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, then we return it for
  // the reduction instruction and 0 for the other instructions in the pattern.
  // If it is not, we return None, specifying that the original cost method
  // should be used.
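  // As an illustrative (hypothetical) IR shape for the first pattern:
  //   %ea  = sext i8 %a to i32
  //   %eb  = sext i8 %b to i32
  //   %mul = mul i32 %ea, %eb
  //   %red = add i32 %mul, %phi   ; the in-loop reduction add
  // Starting from %ea or %eb, RetI is walked up through %mul to %red before
  // the pattern cost is compared against the cost of the components.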
6666   Instruction *RetI = I;
6667   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6668     if (!RetI->hasOneUser())
6669       return None;
6670     RetI = RetI->user_back();
6671   }
6672   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6673       RetI->user_back()->getOpcode() == Instruction::Add) {
6674     if (!RetI->hasOneUser())
6675       return None;
6676     RetI = RetI->user_back();
6677   }
6678 
  // Test if the found instruction is a reduction, and if not return None so
  // that the caller uses the original cost modelling.
6681   if (!InLoopReductionImmediateChains.count(RetI))
6682     return None;
6683 
6684   // Find the reduction this chain is a part of and calculate the basic cost of
6685   // the reduction on its own.
6686   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6687   Instruction *ReductionPhi = LastChain;
6688   while (!isa<PHINode>(ReductionPhi))
6689     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6690 
6691   const RecurrenceDescriptor &RdxDesc =
6692       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6693 
6694   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6695       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6696 
6697   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6698   // normal fmul instruction to the cost of the fadd reduction.
6699   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6700     BaseCost +=
6701         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6702 
6703   // If we're using ordered reductions then we can just return the base cost
6704   // here, since getArithmeticReductionCost calculates the full ordered
6705   // reduction cost when FP reassociation is not allowed.
6706   if (useOrderedReductions(RdxDesc))
6707     return BaseCost;
6708 
6709   // Get the operand that was not the reduction chain and match it to one of the
6710   // patterns, returning the better cost if it is found.
6711   Instruction *RedOp = RetI->getOperand(1) == LastChain
6712                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6713                            : dyn_cast<Instruction>(RetI->getOperand(1));
6714 
6715   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6716 
6717   Instruction *Op0, *Op1;
6718   if (RedOp &&
6719       match(RedOp,
6720             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6721       match(Op0, m_ZExtOrSExt(m_Value())) &&
6722       Op0->getOpcode() == Op1->getOpcode() &&
6723       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6724       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6725       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6726 
    // Matched reduce(ext(mul(ext(A), ext(B))))
6728     // Note that the extend opcodes need to all match, or if A==B they will have
6729     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6730     // which is equally fine.
6731     bool IsUnsigned = isa<ZExtInst>(Op0);
6732     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6733     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6734 
6735     InstructionCost ExtCost =
6736         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6737                              TTI::CastContextHint::None, CostKind, Op0);
6738     InstructionCost MulCost =
6739         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6740     InstructionCost Ext2Cost =
6741         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6742                              TTI::CastContextHint::None, CostKind, RedOp);
6743 
6744     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6745         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6746         CostKind);
6747 
6748     if (RedCost.isValid() &&
6749         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6750       return I == RetI ? RedCost : 0;
6751   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6752              !TheLoop->isLoopInvariant(RedOp)) {
6753     // Matched reduce(ext(A))
6754     bool IsUnsigned = isa<ZExtInst>(RedOp);
6755     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6756     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6757         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6758         CostKind);
6759 
6760     InstructionCost ExtCost =
6761         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6762                              TTI::CastContextHint::None, CostKind, RedOp);
6763     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6764       return I == RetI ? RedCost : 0;
6765   } else if (RedOp &&
6766              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6767     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6768         Op0->getOpcode() == Op1->getOpcode() &&
6769         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6770       bool IsUnsigned = isa<ZExtInst>(Op0);
6771       Type *Op0Ty = Op0->getOperand(0)->getType();
6772       Type *Op1Ty = Op1->getOperand(0)->getType();
6773       Type *LargestOpTy =
6774           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6775                                                                     : Op0Ty;
6776       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6777 
      // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
      // different sizes. We take the largest type as the ext to reduce, and add
      // the remaining cost as, for example, reduce(mul(ext(ext(A)), ext(B))).
6781       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6782           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6783           TTI::CastContextHint::None, CostKind, Op0);
6784       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6785           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6786           TTI::CastContextHint::None, CostKind, Op1);
6787       InstructionCost MulCost =
6788           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6789 
6790       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6791           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6792           CostKind);
6793       InstructionCost ExtraExtCost = 0;
6794       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6795         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6796         ExtraExtCost = TTI.getCastInstrCost(
6797             ExtraExtOp->getOpcode(), ExtType,
6798             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6799             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6800       }
6801 
6802       if (RedCost.isValid() &&
6803           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6804         return I == RetI ? RedCost : 0;
6805     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6806       // Matched reduce(mul())
6807       InstructionCost MulCost =
6808           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6809 
6810       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6811           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
6812           CostKind);
6813 
6814       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6815         return I == RetI ? RedCost : 0;
6816     }
6817   }
6818 
6819   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
6820 }
6821 
6822 InstructionCost
6823 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6824                                                      ElementCount VF) {
6825   // Calculate scalar cost only. Vectorization cost should be ready at this
6826   // moment.
6827   if (VF.isScalar()) {
6828     Type *ValTy = getLoadStoreType(I);
6829     const Align Alignment = getLoadStoreAlignment(I);
6830     unsigned AS = getLoadStoreAddressSpace(I);
6831 
6832     return TTI.getAddressComputationCost(ValTy) +
6833            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6834                                TTI::TCK_RecipThroughput, I);
6835   }
6836   return getWideningCost(I, VF);
6837 }
6838 
6839 LoopVectorizationCostModel::VectorizationCostTy
6840 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6841                                                ElementCount VF) {
6842   // If we know that this instruction will remain uniform, check the cost of
6843   // the scalar version.
6844   if (isUniformAfterVectorization(I, VF))
6845     VF = ElementCount::getFixed(1);
6846 
6847   if (VF.isVector() && isProfitableToScalarize(I, VF))
6848     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6849 
6850   // Forced scalars do not have any scalarization overhead.
6851   auto ForcedScalar = ForcedScalars.find(VF);
6852   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6853     auto InstSet = ForcedScalar->second;
6854     if (InstSet.count(I))
6855       return VectorizationCostTy(
6856           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6857            VF.getKnownMinValue()),
6858           false);
6859   }
6860 
6861   Type *VectorTy;
6862   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6863 
6864   bool TypeNotScalarized = false;
6865   if (VF.isVector() && VectorTy->isVectorTy()) {
6866     unsigned NumParts = TTI.getNumberOfParts(VectorTy);
6867     if (NumParts)
6868       TypeNotScalarized = NumParts < VF.getKnownMinValue();
6869     else
6870       C = InstructionCost::getInvalid();
6871   }
6872   return VectorizationCostTy(C, TypeNotScalarized);
6873 }
6874 
6875 InstructionCost
6876 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6877                                                      ElementCount VF) const {
6878 
6879   // There is no mechanism yet to create a scalable scalarization loop,
6880   // so this is currently Invalid.
6881   if (VF.isScalable())
6882     return InstructionCost::getInvalid();
6883 
6884   if (VF.isScalar())
6885     return 0;
6886 
6887   InstructionCost Cost = 0;
6888   Type *RetTy = ToVectorTy(I->getType(), VF);
6889   if (!RetTy->isVoidTy() &&
6890       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6891     Cost += TTI.getScalarizationOverhead(
6892         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
6893         false);
6894 
6895   // Some targets keep addresses scalar.
6896   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6897     return Cost;
6898 
6899   // Some targets support efficient element stores.
6900   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6901     return Cost;
6902 
6903   // Collect operands to consider.
6904   CallInst *CI = dyn_cast<CallInst>(I);
6905   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6906 
6907   // Skip operands that do not require extraction/scalarization and do not incur
6908   // any overhead.
6909   SmallVector<Type *> Tys;
6910   for (auto *V : filterExtractingOperands(Ops, VF))
6911     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6912   return Cost + TTI.getOperandsScalarizationOverhead(
6913                     filterExtractingOperands(Ops, VF), Tys);
6914 }
6915 
6916 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6917   if (VF.isScalar())
6918     return;
6919   NumPredStores = 0;
6920   for (BasicBlock *BB : TheLoop->blocks()) {
6921     // For each instruction in the old loop.
6922     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6924       if (!Ptr)
6925         continue;
6926 
6927       // TODO: We should generate better code and update the cost model for
6928       // predicated uniform stores. Today they are treated as any other
6929       // predicated store (see added test cases in
6930       // invariant-store-vectorization.ll).
6931       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6932         NumPredStores++;
6933 
6934       if (Legal->isUniformMemOp(I)) {
6935         // TODO: Avoid replicating loads and stores instead of
6936         // relying on instcombine to remove them.
6937         // Load: Scalar load + broadcast
6938         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6939         InstructionCost Cost;
6940         if (isa<StoreInst>(&I) && VF.isScalable() &&
6941             isLegalGatherOrScatter(&I, VF)) {
6942           Cost = getGatherScatterCost(&I, VF);
6943           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
6944         } else {
6945           assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
6946                  "Cannot yet scalarize uniform stores");
6947           Cost = getUniformMemOpCost(&I, VF);
6948           setWideningDecision(&I, VF, CM_Scalarize, Cost);
6949         }
6950         continue;
6951       }
6952 
6953       // We assume that widening is the best solution when possible.
6954       if (memoryInstructionCanBeWidened(&I, VF)) {
6955         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6956         int ConsecutiveStride = Legal->isConsecutivePtr(
6957             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6958         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6959                "Expected consecutive stride.");
6960         InstWidening Decision =
6961             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6962         setWideningDecision(&I, VF, Decision, Cost);
6963         continue;
6964       }
6965 
6966       // Choose between Interleaving, Gather/Scatter or Scalarization.
6967       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6968       unsigned NumAccesses = 1;
6969       if (isAccessInterleaved(&I)) {
6970         auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Failed to get an interleaved access group.");
6972 
6973         // Make one decision for the whole group.
6974         if (getWideningDecision(&I, VF) != CM_Unknown)
6975           continue;
6976 
6977         NumAccesses = Group->getNumMembers();
6978         if (interleavedAccessCanBeWidened(&I, VF))
6979           InterleaveCost = getInterleaveGroupCost(&I, VF);
6980       }
6981 
6982       InstructionCost GatherScatterCost =
6983           isLegalGatherOrScatter(&I, VF)
6984               ? getGatherScatterCost(&I, VF) * NumAccesses
6985               : InstructionCost::getInvalid();
6986 
6987       InstructionCost ScalarizationCost =
6988           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6989 
6990       // Choose better solution for the current VF,
6991       // write down this decision and use it during vectorization.
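      // For example (hypothetical costs): InterleaveCost = 6, an invalid
      // GatherScatterCost and ScalarizationCost = 10 would select
      // CM_Interleave with a cost of 6.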
6992       InstructionCost Cost;
6993       InstWidening Decision;
6994       if (InterleaveCost <= GatherScatterCost &&
6995           InterleaveCost < ScalarizationCost) {
6996         Decision = CM_Interleave;
6997         Cost = InterleaveCost;
6998       } else if (GatherScatterCost < ScalarizationCost) {
6999         Decision = CM_GatherScatter;
7000         Cost = GatherScatterCost;
7001       } else {
7002         Decision = CM_Scalarize;
7003         Cost = ScalarizationCost;
7004       }
      // If the instruction belongs to an interleave group, the whole group
7006       // receives the same decision. The whole group receives the cost, but
7007       // the cost will actually be assigned to one instruction.
7008       if (auto Group = getInterleavedAccessGroup(&I))
7009         setWideningDecision(Group, VF, Decision, Cost);
7010       else
7011         setWideningDecision(&I, VF, Decision, Cost);
7012     }
7013   }
7014 
7015   // Make sure that any load of address and any other address computation
7016   // remains scalar unless there is gather/scatter support. This avoids
7017   // inevitable extracts into address registers, and also has the benefit of
7018   // activating LSR more, since that pass can't optimize vectorized
7019   // addresses.
7020   if (TTI.prefersVectorizedAddressing())
7021     return;
7022 
7023   // Start with all scalar pointer uses.
7024   SmallPtrSet<Instruction *, 8> AddrDefs;
7025   for (BasicBlock *BB : TheLoop->blocks())
7026     for (Instruction &I : *BB) {
7027       Instruction *PtrDef =
7028         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7029       if (PtrDef && TheLoop->contains(PtrDef) &&
7030           getWideningDecision(&I, VF) != CM_GatherScatter)
7031         AddrDefs.insert(PtrDef);
7032     }
7033 
7034   // Add all instructions used to generate the addresses.
7035   SmallVector<Instruction *, 4> Worklist;
7036   append_range(Worklist, AddrDefs);
7037   while (!Worklist.empty()) {
7038     Instruction *I = Worklist.pop_back_val();
7039     for (auto &Op : I->operands())
7040       if (auto *InstOp = dyn_cast<Instruction>(Op))
7041         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7042             AddrDefs.insert(InstOp).second)
7043           Worklist.push_back(InstOp);
7044   }
7045 
7046   for (auto *I : AddrDefs) {
7047     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
7052       InstWidening Decision = getWideningDecision(I, VF);
7053       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7054         // Scalarize a widened load of address.
7055         setWideningDecision(
7056             I, VF, CM_Scalarize,
7057             (VF.getKnownMinValue() *
7058              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7059       else if (auto Group = getInterleavedAccessGroup(I)) {
7060         // Scalarize an interleave group of address loads.
7061         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7062           if (Instruction *Member = Group->getMember(I))
7063             setWideningDecision(
7064                 Member, VF, CM_Scalarize,
7065                 (VF.getKnownMinValue() *
7066                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7067         }
7068       }
7069     } else
      // Make sure I gets scalarized and receives a cost estimate without
      // scalarization overhead.
7072       ForcedScalars[VF].insert(I);
7073   }
7074 }
7075 
7076 InstructionCost
7077 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7078                                                Type *&VectorTy) {
7079   Type *RetTy = I->getType();
7080   if (canTruncateToMinimalBitwidth(I, VF))
7081     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7082   auto SE = PSE.getSE();
7083   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7084 
7085   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7086                                                 ElementCount VF) -> bool {
7087     if (VF.isScalar())
7088       return true;
7089 
7090     auto Scalarized = InstsToScalarize.find(VF);
7091     assert(Scalarized != InstsToScalarize.end() &&
7092            "VF not yet analyzed for scalarization profitability");
7093     return !Scalarized->second.count(I) &&
7094            llvm::all_of(I->users(), [&](User *U) {
7095              auto *UI = cast<Instruction>(U);
7096              return !Scalarized->second.count(UI);
7097            });
7098   };
7099   (void) hasSingleCopyAfterVectorization;
7100 
7101   if (isScalarAfterVectorization(I, VF)) {
7102     // With the exception of GEPs and PHIs, after scalarization there should
7103     // only be one copy of the instruction generated in the loop. This is
7104     // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
7106     // it means we don't have to multiply the instruction cost by VF.
7107     assert(I->getOpcode() == Instruction::GetElementPtr ||
7108            I->getOpcode() == Instruction::PHI ||
7109            (I->getOpcode() == Instruction::BitCast &&
7110             I->getType()->isPointerTy()) ||
7111            hasSingleCopyAfterVectorization(I, VF));
7112     VectorTy = RetTy;
7113   } else
7114     VectorTy = ToVectorTy(RetTy, VF);
7115 
7116   // TODO: We need to estimate the cost of intrinsic calls.
7117   switch (I->getOpcode()) {
7118   case Instruction::GetElementPtr:
7119     // We mark this instruction as zero-cost because the cost of GEPs in
7120     // vectorized code depends on whether the corresponding memory instruction
7121     // is scalarized or not. Therefore, we handle GEPs with the memory
7122     // instruction cost.
7123     return 0;
7124   case Instruction::Br: {
7125     // In cases of scalarized and predicated instructions, there will be VF
7126     // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
7128     bool ScalarPredicatedBB = false;
7129     BranchInst *BI = cast<BranchInst>(I);
7130     if (VF.isVector() && BI->isConditional() &&
7131         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7132          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7133       ScalarPredicatedBB = true;
7134 
7135     if (ScalarPredicatedBB) {
      // Not possible to scalarize a scalable vector with predicated
      // instructions.
7137       if (VF.isScalable())
7138         return InstructionCost::getInvalid();
7139       // Return cost for branches around scalarized and predicated blocks.
7140       auto *Vec_i1Ty =
7141           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7142       return (
7143           TTI.getScalarizationOverhead(
7144               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7145           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7146     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7147       // The back-edge branch will remain, as will all scalar branches.
7148       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7149     else
7150       // This branch will be eliminated by if-conversion.
7151       return 0;
7152     // Note: We currently assume zero cost for an unconditional branch inside
7153     // a predicated block since it will become a fall-through, although we
7154     // may decide in the future to call TTI for all branches.
7155   }
7156   case Instruction::PHI: {
7157     auto *Phi = cast<PHINode>(I);
7158 
7159     // First-order recurrences are replaced by vector shuffles inside the loop.
7160     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7161     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7162       return TTI.getShuffleCost(
7163           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7164           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7165 
7166     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7167     // converted into select instructions. We require N - 1 selects per phi
7168     // node, where N is the number of incoming values.
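    // E.g. a phi with three incoming values in a non-header block becomes two
    // chained selects after if-conversion, hence the (N - 1) factor below.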
7169     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7170       return (Phi->getNumIncomingValues() - 1) *
7171              TTI.getCmpSelInstrCost(
7172                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7173                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7174                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7175 
7176     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7177   }
7178   case Instruction::UDiv:
7179   case Instruction::SDiv:
7180   case Instruction::URem:
7181   case Instruction::SRem:
7182     // If we have a predicated instruction, it may not be executed for each
7183     // vector lane. Get the scalarization cost and scale this amount by the
7184     // probability of executing the predicated block. If the instruction is not
7185     // predicated, we fall through to the next case.
7186     if (VF.isVector() && isScalarWithPredication(I, VF)) {
7187       InstructionCost Cost = 0;
7188 
7189       // These instructions have a non-void type, so account for the phi nodes
7190       // that we will create. This cost is likely to be zero. The phi node
7191       // cost, if any, should be scaled by the block probability because it
7192       // models a copy at the end of each predicated block.
7193       Cost += VF.getKnownMinValue() *
7194               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7195 
7196       // The cost of the non-predicated instruction.
7197       Cost += VF.getKnownMinValue() *
7198               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7199 
7200       // The cost of insertelement and extractelement instructions needed for
7201       // scalarization.
7202       Cost += getScalarizationOverhead(I, VF);
7203 
7204       // Scale the cost by the probability of executing the predicated blocks.
7205       // This assumes the predicated block for each vector lane is equally
7206       // likely.
7207       return Cost / getReciprocalPredBlockProb();
7208     }
7209     LLVM_FALLTHROUGH;
7210   case Instruction::Add:
7211   case Instruction::FAdd:
7212   case Instruction::Sub:
7213   case Instruction::FSub:
7214   case Instruction::Mul:
7215   case Instruction::FMul:
7216   case Instruction::FDiv:
7217   case Instruction::FRem:
7218   case Instruction::Shl:
7219   case Instruction::LShr:
7220   case Instruction::AShr:
7221   case Instruction::And:
7222   case Instruction::Or:
7223   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
7225     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7226       return 0;
7227 
7228     // Detect reduction patterns
7229     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7230       return *RedCost;
7231 
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7234     Value *Op2 = I->getOperand(1);
7235     TargetTransformInfo::OperandValueProperties Op2VP;
7236     TargetTransformInfo::OperandValueKind Op2VK =
7237         TTI.getOperandInfo(Op2, Op2VP);
7238     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7239       Op2VK = TargetTransformInfo::OK_UniformValue;
7240 
7241     SmallVector<const Value *, 4> Operands(I->operand_values());
7242     return TTI.getArithmeticInstrCost(
7243         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7244         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7245   }
7246   case Instruction::FNeg: {
7247     return TTI.getArithmeticInstrCost(
7248         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7249         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7250         TargetTransformInfo::OP_None, I->getOperand(0), I);
7251   }
7252   case Instruction::Select: {
7253     SelectInst *SI = cast<SelectInst>(I);
7254     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7255     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7256 
7257     const Value *Op0, *Op1;
7258     using namespace llvm::PatternMatch;
7259     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7260                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7261       // select x, y, false --> x & y
7262       // select x, true, y --> x | y
7263       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7264       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7265       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7266       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
             Op1->getType()->getScalarSizeInBits() == 1);
7269 
7270       SmallVector<const Value *, 2> Operands{Op0, Op1};
7271       return TTI.getArithmeticInstrCost(
7272           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7273           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7274     }
7275 
7276     Type *CondTy = SI->getCondition()->getType();
7277     if (!ScalarCond)
7278       CondTy = VectorType::get(CondTy, VF);
7279 
7280     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7281     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7282       Pred = Cmp->getPredicate();
7283     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7284                                   CostKind, I);
7285   }
7286   case Instruction::ICmp:
7287   case Instruction::FCmp: {
7288     Type *ValTy = I->getOperand(0)->getType();
7289     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7290     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7291       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7292     VectorTy = ToVectorTy(ValTy, VF);
7293     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7294                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7295                                   I);
7296   }
7297   case Instruction::Store:
7298   case Instruction::Load: {
7299     ElementCount Width = VF;
7300     if (Width.isVector()) {
7301       InstWidening Decision = getWideningDecision(I, Width);
7302       assert(Decision != CM_Unknown &&
7303              "CM decision should be taken at this point");
7304       if (Decision == CM_Scalarize)
7305         Width = ElementCount::getFixed(1);
7306     }
7307     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7308     return getMemoryInstructionCost(I, VF);
7309   }
7310   case Instruction::BitCast:
7311     if (I->getType()->isPointerTy())
7312       return 0;
7313     LLVM_FALLTHROUGH;
7314   case Instruction::ZExt:
7315   case Instruction::SExt:
7316   case Instruction::FPToUI:
7317   case Instruction::FPToSI:
7318   case Instruction::FPExt:
7319   case Instruction::PtrToInt:
7320   case Instruction::IntToPtr:
7321   case Instruction::SIToFP:
7322   case Instruction::UIToFP:
7323   case Instruction::Trunc:
7324   case Instruction::FPTrunc: {
7325     // Computes the CastContextHint from a Load/Store instruction.
7326     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7327       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7328              "Expected a load or a store!");
7329 
7330       if (VF.isScalar() || !TheLoop->contains(I))
7331         return TTI::CastContextHint::Normal;
7332 
7333       switch (getWideningDecision(I, VF)) {
7334       case LoopVectorizationCostModel::CM_GatherScatter:
7335         return TTI::CastContextHint::GatherScatter;
7336       case LoopVectorizationCostModel::CM_Interleave:
7337         return TTI::CastContextHint::Interleave;
7338       case LoopVectorizationCostModel::CM_Scalarize:
7339       case LoopVectorizationCostModel::CM_Widen:
7340         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7341                                         : TTI::CastContextHint::Normal;
7342       case LoopVectorizationCostModel::CM_Widen_Reverse:
7343         return TTI::CastContextHint::Reversed;
7344       case LoopVectorizationCostModel::CM_Unknown:
7345         llvm_unreachable("Instr did not go through cost modelling?");
7346       }
7347 
7348       llvm_unreachable("Unhandled case!");
7349     };
7350 
7351     unsigned Opcode = I->getOpcode();
7352     TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc/FPTrunc, the context is the only user, which must be a
    // StoreInst.
7354     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7355       if (I->hasOneUse())
7356         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7357           CCH = ComputeCCH(Store);
7358     }
    // For ZExt/SExt/FPExt, the context is the operand, which must be a
    // LoadInst.
7360     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7361              Opcode == Instruction::FPExt) {
7362       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7363         CCH = ComputeCCH(Load);
7364     }
7365 
7366     // We optimize the truncation of induction variables having constant
7367     // integer steps. The cost of these truncations is the same as the scalar
7368     // operation.
7369     if (isOptimizableIVTruncate(I, VF)) {
7370       auto *Trunc = cast<TruncInst>(I);
7371       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7372                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7373     }
7374 
7375     // Detect reduction patterns
7376     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7377       return *RedCost;
7378 
7379     Type *SrcScalarTy = I->getOperand(0)->getType();
7380     Type *SrcVecTy =
7381         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7382     if (canTruncateToMinimalBitwidth(I, VF)) {
      // This cast is going to be shrunk. This may remove the cast or it might
      // turn it into a slightly different cast. For example, if MinBW == 16,
      // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7386       //
7387       // Calculate the modified src and dest types.
7388       Type *MinVecTy = VectorTy;
7389       if (Opcode == Instruction::Trunc) {
7390         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7391         VectorTy =
7392             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7393       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7394         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7395         VectorTy =
7396             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7397       }
7398     }
7399 
7400     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7401   }
7402   case Instruction::Call: {
7403     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7404       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7405         return *RedCost;
7406     bool NeedToScalarize;
7407     CallInst *CI = cast<CallInst>(I);
7408     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7409     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7410       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7411       return std::min(CallCost, IntrinsicCost);
7412     }
7413     return CallCost;
7414   }
7415   case Instruction::ExtractValue:
7416     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7417   case Instruction::Alloca:
7418     // We cannot easily widen alloca to a scalable alloca, as
7419     // the result would need to be a vector of pointers.
7420     if (VF.isScalable())
7421       return InstructionCost::getInvalid();
7422     LLVM_FALLTHROUGH;
7423   default:
7424     // This opcode is unknown. Assume that it is the same as 'mul'.
7425     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7426   } // end of switch.
7427 }
7428 
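// Legacy pass manager pass ID; its address uniquely identifies the pass.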
7429 char LoopVectorize::ID = 0;
7430 
7431 static const char lv_name[] = "Loop Vectorization";
7432 
7433 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7434 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7435 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7436 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7437 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7438 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7439 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7440 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7441 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7442 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7443 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7444 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7445 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7446 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7447 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7448 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7449 
7450 namespace llvm {
7451 
7452 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7453 
7454 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7455                               bool VectorizeOnlyWhenForced) {
7456   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7457 }
7458 
7459 } // end namespace llvm
7460 
7461 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7462   // Check if the pointer operand of a load or store instruction is
7463   // consecutive.
7464   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7465     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7466   return false;
7467 }
7468 
7469 void LoopVectorizationCostModel::collectValuesToIgnore() {
7470   // Ignore ephemeral values.
7471   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7472 
7473   // Ignore type-promoting instructions we identified during reduction
7474   // detection.
7475   for (auto &Reduction : Legal->getReductionVars()) {
7476     const RecurrenceDescriptor &RedDes = Reduction.second;
7477     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7478     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7479   }
7480   // Ignore type-casting instructions we identified during induction
7481   // detection.
7482   for (auto &Induction : Legal->getInductionVars()) {
7483     const InductionDescriptor &IndDes = Induction.second;
7484     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7485     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7486   }
7487 }
7488 
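// For each reduction phi, decide whether the reduction should be computed
// in-loop, and if so record the chain of operations from the phi to the loop
// exit value for use during cost modelling and recipe construction.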
7489 void LoopVectorizationCostModel::collectInLoopReductions() {
7490   for (auto &Reduction : Legal->getReductionVars()) {
7491     PHINode *Phi = Reduction.first;
7492     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7493 
7494     // We don't collect reductions that are type promoted (yet).
7495     if (RdxDesc.getRecurrenceType() != Phi->getType())
7496       continue;
7497 
7498     // If the target would prefer this reduction to happen "in-loop", then we
7499     // want to record it as such.
7500     unsigned Opcode = RdxDesc.getOpcode();
7501     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7502         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7503                                    TargetTransformInfo::ReductionFlags()))
7504       continue;
7505 
7506     // Check that we can correctly put the reductions into the loop, by
7507     // finding the chain of operations that leads from the phi to the loop
7508     // exit value.
7509     SmallVector<Instruction *, 4> ReductionOperations =
7510         RdxDesc.getReductionOpChain(Phi, TheLoop);
7511     bool InLoop = !ReductionOperations.empty();
7512     if (InLoop) {
7513       InLoopReductionChains[Phi] = ReductionOperations;
7514       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7515       Instruction *LastChain = Phi;
7516       for (auto *I : ReductionOperations) {
7517         InLoopReductionImmediateChains[I] = LastChain;
7518         LastChain = I;
7519       }
7520     }
7521     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7522                       << " reduction for phi: " << *Phi << "\n");
7523   }
7524 }
7525 
7526 // TODO: we could return a pair of values that specify the max VF and
7527 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
// `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7529 // doesn't have a cost model that can choose which plan to execute if
7530 // more than one is generated.
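// Pick a VF for the VPlan-native path: the widest vector register width
// divided by the widest scalar type used in the loop.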
7531 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7532                                  LoopVectorizationCostModel &CM) {
7533   unsigned WidestType;
7534   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7535   return WidestVectorRegBits / WidestType;
7536 }
7537 
7538 VectorizationFactor
7539 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7540   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7541   ElementCount VF = UserVF;
7542   // Outer loop handling: They may require CFG and instruction level
7543   // transformations before even evaluating whether vectorization is profitable.
7544   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7545   // the vectorization pipeline.
7546   if (!OrigLoop->isInnermost()) {
7547     // If the user doesn't provide a vectorization factor, determine a
7548     // reasonable one.
7549     if (UserVF.isZero()) {
7550       VF = ElementCount::getFixed(determineVPlanVF(
7551           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7552               .getFixedSize(),
7553           CM));
7554       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7555 
7556       // Make sure we have a VF > 1 for stress testing.
7557       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7558         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7559                           << "overriding computed VF.\n");
7560         VF = ElementCount::getFixed(4);
7561       }
7562     }
7563     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7564     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7565            "VF needs to be a power of two");
7566     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7567                       << "VF " << VF << " to build VPlans.\n");
7568     buildVPlans(VF, VF);
7569 
7570     // For VPlan build stress testing, we bail out after VPlan construction.
7571     if (VPlanBuildStressTest)
7572       return VectorizationFactor::Disabled();
7573 
7574     return {VF, 0 /*Cost*/};
7575   }
7576 
7577   LLVM_DEBUG(
7578       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7579                 "VPlan-native path.\n");
7580   return VectorizationFactor::Disabled();
7581 }
7582 
7583 Optional<VectorizationFactor>
7584 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7585   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7586   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7588     return None;
7589 
7590   // Invalidate interleave groups if all blocks of loop will be predicated.
7591   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7592       !useMaskedInterleavedAccesses(*TTI)) {
7593     LLVM_DEBUG(
7594         dbgs()
7595         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7596            "which requires masked-interleaved support.\n");
7597     if (CM.InterleaveInfo.invalidateGroups())
7598       // Invalidating interleave groups also requires invalidating all decisions
7599       // based on them, which includes widening decisions and uniform and scalar
7600       // values.
7601       CM.invalidateCostModelingDecisions();
7602   }
7603 
7604   ElementCount MaxUserVF =
7605       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7606   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7607   if (!UserVF.isZero() && UserVFIsLegal) {
7608     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7609            "VF needs to be a power of two");
7610     // Collect the instructions (and their associated costs) that will be more
7611     // profitable to scalarize.
7612     if (CM.selectUserVectorizationFactor(UserVF)) {
7613       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7614       CM.collectInLoopReductions();
7615       buildVPlansWithVPRecipes(UserVF, UserVF);
7616       LLVM_DEBUG(printPlans(dbgs()));
7617       return {{UserVF, 0}};
7618     } else
7619       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7620                               "InvalidCost", ORE, OrigLoop);
7621   }
7622 
7623   // Populate the set of Vectorization Factor Candidates.
7624   ElementCountSet VFCandidates;
7625   for (auto VF = ElementCount::getFixed(1);
7626        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7627     VFCandidates.insert(VF);
7628   for (auto VF = ElementCount::getScalable(1);
7629        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7630     VFCandidates.insert(VF);
7631 
7632   for (const auto &VF : VFCandidates) {
7633     // Collect Uniform and Scalar instructions after vectorization with VF.
7634     CM.collectUniformsAndScalars(VF);
7635 
7636     // Collect the instructions (and their associated costs) that will be more
7637     // profitable to scalarize.
7638     if (VF.isVector())
7639       CM.collectInstsToScalarize(VF);
7640   }
7641 
7642   CM.collectInLoopReductions();
7643   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7644   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7645 
7646   LLVM_DEBUG(printPlans(dbgs()));
7647   if (!MaxFactors.hasVector())
7648     return VectorizationFactor::Disabled();
7649 
7650   // Select the optimal vectorization factor.
7651   auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
7652 
7653   // Check if it is profitable to vectorize with runtime checks.
7654   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
7655   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
7656     bool PragmaThresholdReached =
7657         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
7658     bool ThresholdReached =
7659         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
7660     if ((ThresholdReached && !Hints.allowReordering()) ||
7661         PragmaThresholdReached) {
7662       ORE->emit([&]() {
7663         return OptimizationRemarkAnalysisAliasing(
7664                    DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
7665                    OrigLoop->getHeader())
7666                << "loop not vectorized: cannot prove it is safe to reorder "
7667                   "memory operations";
7668       });
7669       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
7670       Hints.emitRemarkWithHints();
7671       return VectorizationFactor::Disabled();
7672     }
7673   }
7674   return SelectedVF;
7675 }
7676 
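// Return the VPlan that covers the given VF; exactly one such plan is expected
// to exist.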
7677 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
  assert(count_if(VPlans,
                  [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
             1 &&
         "Best VF does not have a single VPlan.");
7682 
7683   for (const VPlanPtr &Plan : VPlans) {
7684     if (Plan->hasVF(VF))
7685       return *Plan.get();
7686   }
7687   llvm_unreachable("No plan found!");
7688 }
7689 
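// Attach llvm.loop.unroll.runtime.disable metadata to the loop, preserving any
// existing loop metadata, unless unroll-disabling metadata is already present.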
7690 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7691   SmallVector<Metadata *, 4> MDs;
7692   // Reserve first location for self reference to the LoopID metadata node.
7693   MDs.push_back(nullptr);
7694   bool IsUnrollMetadata = false;
7695   MDNode *LoopID = L->getLoopID();
7696   if (LoopID) {
7697     // First find existing loop unrolling disable metadata.
7698     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7699       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7700       if (MD) {
7701         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7702         IsUnrollMetadata =
7703             S && S->getString().startswith("llvm.loop.unroll.disable");
7704       }
7705       MDs.push_back(LoopID->getOperand(i));
7706     }
7707   }
7708 
7709   if (!IsUnrollMetadata) {
7710     // Add runtime unroll disable metadata.
7711     LLVMContext &Context = L->getHeader()->getContext();
7712     SmallVector<Metadata *, 1> DisableOperands;
7713     DisableOperands.push_back(
7714         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7715     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7716     MDs.push_back(DisableNode);
7717     MDNode *NewLoopID = MDNode::get(Context, MDs);
7718     // Set operand 0 to refer to the loop id itself.
7719     NewLoopID->replaceOperandWith(0, NewLoopID);
7720     L->setLoopID(NewLoopID);
7721   }
7722 }
7723 
7724 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7725                                            VPlan &BestVPlan,
7726                                            InnerLoopVectorizer &ILV,
7727                                            DominatorTree *DT) {
7728   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
7729                     << '\n');
7730 
7731   // Perform the actual loop transformation.
7732 
7733   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7734   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7735   Value *CanonicalIVStartValue;
7736   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7737       ILV.createVectorizedLoopSkeleton();
7738   ILV.collectPoisonGeneratingRecipes(State);
7739 
7740   ILV.printDebugTracesAtStart();
7741 
7742   //===------------------------------------------------===//
7743   //
  // Notice: any optimization or new instruction that goes
7745   // into the code below should also be implemented in
7746   // the cost-model.
7747   //
7748   //===------------------------------------------------===//
7749 
7750   // 2. Copy and widen instructions from the old loop into the new loop.
7751   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7752                              ILV.getOrCreateVectorTripCount(nullptr),
7753                              CanonicalIVStartValue, State);
7754   BestVPlan.execute(&State);
7755 
7756   // Keep all loop hints from the original loop on the vector loop (we'll
7757   // replace the vectorizer-specific hints below).
7758   MDNode *OrigLoopID = OrigLoop->getLoopID();
7759 
7760   Optional<MDNode *> VectorizedLoopID =
7761       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7762                                       LLVMLoopVectorizeFollowupVectorized});
7763 
7764   Loop *L = LI->getLoopFor(State.CFG.PrevBB);
7765   if (VectorizedLoopID.hasValue())
7766     L->setLoopID(VectorizedLoopID.getValue());
7767   else {
7768     // Keep all loop hints from the original loop on the vector loop (we'll
7769     // replace the vectorizer-specific hints below).
7770     if (MDNode *LID = OrigLoop->getLoopID())
7771       L->setLoopID(LID);
7772 
7773     LoopVectorizeHints Hints(L, true, *ORE);
7774     Hints.setAlreadyVectorized();
7775   }
7776   // Disable runtime unrolling when vectorizing the epilogue loop.
7777   if (CanonicalIVStartValue)
7778     AddRuntimeUnrollDisableMetaData(L);
7779 
7780   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7781   //    predication, updating analyses.
7782   ILV.fixVectorizedLoop(State);
7783 
7784   ILV.printDebugTracesAtEnd();
7785 }
7786 
7787 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7788 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7789   for (const auto &Plan : VPlans)
7790     if (PrintVPlansInDotFormat)
7791       Plan->printDOT(O);
7792     else
7793       Plan->print(O);
7794 }
7795 #endif
7796 
7797 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7798     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7799 
  // We create new control-flow for the vectorized loop, so an original exit
  // condition will be dead after vectorization if it is only used by its
  // terminator.
7803   SmallVector<BasicBlock*> ExitingBlocks;
7804   OrigLoop->getExitingBlocks(ExitingBlocks);
7805   for (auto *BB : ExitingBlocks) {
7806     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7807     if (!Cmp || !Cmp->hasOneUse())
7808       continue;
7809 
7810     // TODO: we should introduce a getUniqueExitingBlocks on Loop
7811     if (!DeadInstructions.insert(Cmp).second)
7812       continue;
7813 
    // An operand of the icmp is often a dead trunc, used by IndUpdate.
    // TODO: we could recurse through operands in general.
    for (Value *Op : Cmp->operands()) {
      if (isa<TruncInst>(Op) && Op->hasOneUse())
        DeadInstructions.insert(cast<Instruction>(Op));
7819     }
7820   }
7821 
7822   // We create new "steps" for induction variable updates to which the original
7823   // induction variables map. An original update instruction will be dead if
7824   // all its users except the induction variable are dead.
7825   auto *Latch = OrigLoop->getLoopLatch();
7826   for (auto &Induction : Legal->getInductionVars()) {
7827     PHINode *Ind = Induction.first;
7828     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
7829 
    // If the tail is to be folded by masking, the primary induction variable,
    // if it exists, isn't dead: it will be used for masking. Don't kill it.
7832     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
7833       continue;
7834 
7835     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
7836           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
7837         }))
7838       DeadInstructions.insert(IndUpdate);
7839   }
7840 }
7841 
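// When only interleaving (VF = 1), broadcasting a value is a no-op; the scalar
// value is used directly.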
7842 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7843 
7844 //===--------------------------------------------------------------------===//
7845 // EpilogueVectorizerMainLoop
7846 //===--------------------------------------------------------------------===//
7847 
7848 /// This function is partially responsible for generating the control flow
7849 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7850 std::pair<BasicBlock *, Value *>
7851 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7852   MDNode *OrigLoopID = OrigLoop->getLoopID();
7853   Loop *Lp = createVectorLoopSkeleton("");
7854 
7855   // Generate the code to check the minimum iteration count of the vector
7856   // epilogue (see below).
7857   EPI.EpilogueIterationCountCheck =
7858       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
7859   EPI.EpilogueIterationCountCheck->setName("iter.check");
7860 
7861   // Generate the code to check any assumptions that we've made for SCEV
7862   // expressions.
7863   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7864 
7865   // Generate the code that checks at runtime if arrays overlap. We put the
7866   // checks into a separate block to make the more common case of few elements
7867   // faster.
7868   EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
7869 
7870   // Generate the iteration count check for the main loop, *after* the check
7871   // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer path length for
  // the main loop is compensated for by the gain from vectorizing the larger
7874   // trip count. Note: the branch will get updated later on when we vectorize
7875   // the epilogue.
7876   EPI.MainLoopIterationCountCheck =
7877       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
7878 
7879   // Generate the induction variable.
7880   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
7881   EPI.VectorTripCount = CountRoundDown;
7882   createHeaderBranch(Lp);
7883 
  // Skip induction resume value creation here; the resume values will be
  // created in the second pass. If we created them here, they wouldn't be
  // used anyway, because the VPlan in the second pass still contains the
  // inductions from the original loop.
7888 
7889   return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
7890 }
7891 
7892 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7893   LLVM_DEBUG({
7894     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7895            << "Main Loop VF:" << EPI.MainLoopVF
7896            << ", Main Loop UF:" << EPI.MainLoopUF
7897            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7898            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7899   });
7900 }
7901 
7902 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7903   DEBUG_WITH_TYPE(VerboseDebug, {
7904     dbgs() << "intermediate fn:\n"
7905            << *OrigLoop->getHeader()->getParent() << "\n";
7906   });
7907 }
7908 
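// Emit a check of the trip count against VF * UF of either the main vector
// loop or the vector epilogue (depending on ForEpilogue), branching to Bypass
// when there are too few iterations.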
7909 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
7910     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
7911   assert(L && "Expected valid Loop.");
7912   assert(Bypass && "Expected valid bypass basic block.");
7913   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7914   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7915   Value *Count = getOrCreateTripCount(L);
7916   // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
7918   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7919   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7920 
7921   // Generate code to check if the loop's trip count is less than VF * UF of the
7922   // main vector loop.
7923   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
7924       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7925 
7926   Value *CheckMinIters = Builder.CreateICmp(
7927       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7928       "min.iters.check");
7929 
7930   if (!ForEpilogue)
7931     TCCheckBlock->setName("vector.main.loop.iter.check");
7932 
7933   // Create new preheader for vector loop.
7934   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7935                                    DT, LI, nullptr, "vector.ph");
7936 
7937   if (ForEpilogue) {
7938     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7939                                  DT->getNode(Bypass)->getIDom()) &&
7940            "TC check is expected to dominate Bypass");
7941 
7942     // Update dominator for Bypass & LoopExit.
7943     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7944     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7945       // For loops with multiple exits, there's no edge from the middle block
7946       // to exit blocks (as the epilogue must run) and thus no need to update
7947       // the immediate dominator of the exit blocks.
7948       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7949 
7950     LoopBypassBlocks.push_back(TCCheckBlock);
7951 
7952     // Save the trip count so we don't have to regenerate it in the
7953     // vec.epilog.iter.check. This is safe to do because the trip count
7954     // generated here dominates the vector epilog iter check.
7955     EPI.TripCount = Count;
7956   }
7957 
7958   ReplaceInstWithInst(
7959       TCCheckBlock->getTerminator(),
7960       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7961 
7962   return TCCheckBlock;
7963 }
7964 
7965 //===--------------------------------------------------------------------===//
7966 // EpilogueVectorizerEpilogueLoop
7967 //===--------------------------------------------------------------------===//
7968 
7969 /// This function is partially responsible for generating the control flow
7970 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7971 std::pair<BasicBlock *, Value *>
7972 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7973   MDNode *OrigLoopID = OrigLoop->getLoopID();
7974   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
7975 
  // Now, compare the remaining count and, if there aren't enough iterations to
  // execute the vectorized epilogue, skip to the scalar part.
7978   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7979   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7980   LoopVectorPreHeader =
7981       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7982                  LI, nullptr, "vec.epilog.ph");
7983   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7984                                           VecEpilogueIterationCountCheck);
7985 
7986   // Adjust the control flow taking the state info from the main loop
7987   // vectorization into account.
7988   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7989          "expected this to be saved from the previous pass.");
7990   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7991       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7992 
7993   DT->changeImmediateDominator(LoopVectorPreHeader,
7994                                EPI.MainLoopIterationCountCheck);
7995 
7996   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7997       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7998 
7999   if (EPI.SCEVSafetyCheck)
8000     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8001         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8002   if (EPI.MemSafetyCheck)
8003     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8004         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8005 
8006   DT->changeImmediateDominator(
8007       VecEpilogueIterationCountCheck,
8008       VecEpilogueIterationCountCheck->getSinglePredecessor());
8009 
8010   DT->changeImmediateDominator(LoopScalarPreHeader,
8011                                EPI.EpilogueIterationCountCheck);
8012   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8013     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
8015     // dominator of the exit blocks.
8016     DT->changeImmediateDominator(LoopExitBlock,
8017                                  EPI.EpilogueIterationCountCheck);
8018 
8019   // Keep track of bypass blocks, as they feed start values to the induction
8020   // phis in the scalar loop preheader.
8021   if (EPI.SCEVSafetyCheck)
8022     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8023   if (EPI.MemSafetyCheck)
8024     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8025   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8026 
8027   // The vec.epilog.iter.check block may contain Phi nodes from reductions which
8028   // merge control-flow from the latch block and the middle block. Update the
8029   // incoming values here and move the Phi into the preheader.
8030   SmallVector<PHINode *, 4> PhisInBlock;
8031   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
8032     PhisInBlock.push_back(&Phi);
8033 
8034   for (PHINode *Phi : PhisInBlock) {
8035     Phi->replaceIncomingBlockWith(
8036         VecEpilogueIterationCountCheck->getSinglePredecessor(),
8037         VecEpilogueIterationCountCheck);
8038     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
8039     if (EPI.SCEVSafetyCheck)
8040       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
8041     if (EPI.MemSafetyCheck)
8042       Phi->removeIncomingValue(EPI.MemSafetyCheck);
8043     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
8044   }
8045 
8046   // Generate a resume induction for the vector epilogue and put it in the
8047   // vector epilogue preheader
8048   Type *IdxTy = Legal->getWidestInductionType();
8049   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8050                                          LoopVectorPreHeader->getFirstNonPHI());
8051   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8052   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8053                            EPI.MainLoopIterationCountCheck);
8054 
8055   // Generate the induction variable.
8056   createHeaderBranch(Lp);
8057 
8058   // Generate induction resume values. These variables save the new starting
8059   // indexes for the scalar loop. They are used to test if there are any tail
8060   // iterations left once the vector loop has completed.
8061   // Note that when the vectorized epilogue is skipped due to iteration count
8062   // check, then the resume value for the induction variable comes from
8063   // the trip count of the main vector loop, hence passing the AdditionalBypass
8064   // argument.
8065   createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
8066                                    EPI.VectorTripCount} /* AdditionalBypass */);
8067 
8068   return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
8069 }
8070 
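// Emit a check of the iterations remaining after the main vector loop against
// VF * UF of the vector epilogue, branching to Bypass when the epilogue should
// be skipped.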
8071 BasicBlock *
8072 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8073     BasicBlock *Bypass, BasicBlock *Insert) {
8074 
8075   assert(EPI.TripCount &&
8076          "Expected trip count to have been safed in the first pass.");
8077   assert(
8078       (!isa<Instruction>(EPI.TripCount) ||
8079        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8080       "saved trip count does not dominate insertion point.");
8081   Value *TC = EPI.TripCount;
8082   IRBuilder<> Builder(Insert->getTerminator());
8083   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8084 
8085   // Generate code to check if the loop's trip count is less than VF * UF of the
8086   // vector epilogue loop.
8087   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8088       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8089 
8090   Value *CheckMinIters =
8091       Builder.CreateICmp(P, Count,
8092                          createStepForVF(Builder, Count->getType(),
8093                                          EPI.EpilogueVF, EPI.EpilogueUF),
8094                          "min.epilog.iters.check");
8095 
8096   ReplaceInstWithInst(
8097       Insert->getTerminator(),
8098       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8099 
8100   LoopBypassBlocks.push_back(Insert);
8101   return Insert;
8102 }
8103 
8104 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8105   LLVM_DEBUG({
8106     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8107            << "Epilogue Loop VF:" << EPI.EpilogueVF
8108            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8109   });
8110 }
8111 
8112 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8113   DEBUG_WITH_TYPE(VerboseDebug, {
8114     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8115   });
8116 }
8117 
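// Evaluate Predicate on the VFs in Range, clamping Range.End to the first VF
// whose result differs from the result at Range.Start, and return the result
// for Range.Start.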
8118 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8119     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8120   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8121   bool PredicateAtRangeStart = Predicate(Range.Start);
8122 
8123   for (ElementCount TmpVF = Range.Start * 2;
8124        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8125     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8126       Range.End = TmpVF;
8127       break;
8128     }
8129 
8130   return PredicateAtRangeStart;
8131 }
8132 
/// Build VPlans for the full range of feasible VFs = {\p MinVF, 2 * \p MinVF,
8134 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8135 /// of VF's starting at a given VF and extending it as much as possible. Each
8136 /// vectorization decision can potentially shorten this sub-range during
8137 /// buildVPlan().
8138 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8139                                            ElementCount MaxVF) {
8140   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8141   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8142     VFRange SubRange = {VF, MaxVFPlusOne};
8143     VPlans.push_back(buildVPlan(SubRange));
8144     VF = SubRange.End;
8145   }
8146 }
8147 
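// Compute and cache the mask for the edge from Src to Dst: the source block's
// mask combined with the branch condition (negated when Dst is the false
// successor). A select is used for the combination to avoid propagating a
// poison edge condition when the source mask is false.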
8148 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8149                                          VPlanPtr &Plan) {
8150   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8151 
8152   // Look for cached value.
8153   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8154   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8155   if (ECEntryIt != EdgeMaskCache.end())
8156     return ECEntryIt->second;
8157 
8158   VPValue *SrcMask = createBlockInMask(Src, Plan);
8159 
8160   // The terminator has to be a branch inst!
8161   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8162   assert(BI && "Unexpected terminator found");
8163 
8164   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8165     return EdgeMaskCache[Edge] = SrcMask;
8166 
8167   // If source is an exiting block, we know the exit edge is dynamically dead
8168   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8169   // adding uses of an otherwise potentially dead instruction.
8170   if (OrigLoop->isLoopExiting(Src))
8171     return EdgeMaskCache[Edge] = SrcMask;
8172 
8173   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8174   assert(EdgeMask && "No Edge Mask found for condition");
8175 
8176   if (BI->getSuccessor(0) != Dst)
8177     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8178 
8179   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8180     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8181     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8182     // The select version does not introduce new UB if SrcMask is false and
8183     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
8184     VPValue *False = Plan->getOrAddVPValue(
8185         ConstantInt::getFalse(BI->getCondition()->getType()));
8186     EdgeMask =
8187         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8188   }
8189 
8190   return EdgeMaskCache[Edge] = EdgeMask;
8191 }
8192 
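// Compute and cache the mask controlling execution of block BB. The header
// mask implements tail folding (active.lane.mask or IV <= BTC); other blocks
// OR the masks of their incoming edges. A null mask represents an all-one
// mask.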
8193 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8194   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8195 
8196   // Look for cached value.
8197   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8198   if (BCEntryIt != BlockMaskCache.end())
8199     return BCEntryIt->second;
8200 
8201   // All-one mask is modelled as no-mask following the convention for masked
8202   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8203   VPValue *BlockMask = nullptr;
8204 
8205   if (OrigLoop->getHeader() == BB) {
8206     if (!CM.blockNeedsPredicationForAnyReason(BB))
8207       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8208 
8209     // Introduce the early-exit compare IV <= BTC to form header block mask.
8210     // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8211     // constructing the desired canonical IV in the header block as its first
8212     // non-phi instructions.
8213     assert(CM.foldTailByMasking() && "must fold the tail");
8214     VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
8215     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8216     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8217     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8218 
8219     VPBuilder::InsertPointGuard Guard(Builder);
8220     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8221     if (CM.TTI.emitGetActiveLaneMask()) {
8222       VPValue *TC = Plan->getOrCreateTripCount();
8223       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
8224     } else {
8225       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8226       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8227     }
8228     return BlockMaskCache[BB] = BlockMask;
8229   }
8230 
8231   // This is the block mask. We OR all incoming edges.
8232   for (auto *Predecessor : predecessors(BB)) {
8233     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8234     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8235       return BlockMaskCache[BB] = EdgeMask;
8236 
8237     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8238       BlockMask = EdgeMask;
8239       continue;
8240     }
8241 
8242     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8243   }
8244 
8245   return BlockMaskCache[BB] = BlockMask;
8246 }
8247 
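// Check whether the memory access I should be widened for the VFs in Range
// and, if so, build a VPWidenMemoryInstructionRecipe, attaching a block-in
// mask when the access requires predication.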
8248 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8249                                                 ArrayRef<VPValue *> Operands,
8250                                                 VFRange &Range,
8251                                                 VPlanPtr &Plan) {
8252   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8253          "Must be called with either a load or store");
8254 
8255   auto willWiden = [&](ElementCount VF) -> bool {
8256     if (VF.isScalar())
8257       return false;
8258     LoopVectorizationCostModel::InstWidening Decision =
8259         CM.getWideningDecision(I, VF);
8260     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8261            "CM decision should be taken at this point.");
8262     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8263       return true;
8264     if (CM.isScalarAfterVectorization(I, VF) ||
8265         CM.isProfitableToScalarize(I, VF))
8266       return false;
8267     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8268   };
8269 
8270   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8271     return nullptr;
8272 
8273   VPValue *Mask = nullptr;
8274   if (Legal->isMaskRequired(I))
8275     Mask = createBlockInMask(I->getParent(), Plan);
8276 
8277   // Determine if the pointer operand of the access is either consecutive or
8278   // reverse consecutive.
8279   LoopVectorizationCostModel::InstWidening Decision =
8280       CM.getWideningDecision(I, Range.Start);
8281   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8282   bool Consecutive =
8283       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8284 
8285   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8286     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8287                                               Consecutive, Reverse);
8288 
8289   StoreInst *Store = cast<StoreInst>(I);
8290   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8291                                             Mask, Consecutive, Reverse);
8292 }
8293 
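// Build a recipe widening the integer or FP induction Phi (or its optimized
// truncation PhiOrTrunc), deciding within Range whether a scalar IV and/or a
// vector IV is needed.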
8294 static VPWidenIntOrFpInductionRecipe *
8295 createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
8296                            VPValue *Start, const InductionDescriptor &IndDesc,
8297                            LoopVectorizationCostModel &CM, ScalarEvolution &SE,
8298                            Loop &OrigLoop, VFRange &Range) {
8299   // Returns true if an instruction \p I should be scalarized instead of
8300   // vectorized for the chosen vectorization factor.
8301   auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8302     return CM.isScalarAfterVectorization(I, VF) ||
8303            CM.isProfitableToScalarize(I, VF);
8304   };
8305 
8306   bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange(
8307       [&](ElementCount VF) {
8308         // Returns true if we should generate a scalar version of \p IV.
8309         if (ShouldScalarizeInstruction(PhiOrTrunc, VF))
8310           return true;
8311         auto isScalarInst = [&](User *U) -> bool {
8312           auto *I = cast<Instruction>(U);
8313           return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF);
8314         };
8315         return any_of(PhiOrTrunc->users(), isScalarInst);
8316       },
8317       Range);
8318   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8319       [&](ElementCount VF) {
8320         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8321       },
8322       Range);
8323   assert(IndDesc.getStartValue() ==
8324          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8325   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8326          "step must be loop invariant");
8327   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8328     return new VPWidenIntOrFpInductionRecipe(
8329         Phi, Start, IndDesc, TruncI, NeedsScalarIV, !NeedsScalarIVOnly, SE);
8330   }
8331   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8332   return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV,
8333                                            !NeedsScalarIVOnly, SE);
8334 }
8335 
8336 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8337     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const {
8338 
8339   // Check if this is an integer or fp induction. If so, build the recipe that
8340   // produces its scalar and vector values.
8341   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8342     return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM,
8343                                       *PSE.getSE(), *OrigLoop, Range);
8344 
8345   return nullptr;
8346 }
8347 
8348 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8349     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8350     VPlan &Plan) const {
8351   // Optimize the special case where the source is a constant integer
8352   // induction variable. Notice that we can only optimize the 'trunc' case
8353   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8354   // (c) other casts depend on pointer size.
8355 
8356   // Determine whether \p K is a truncation based on an induction variable that
8357   // can be optimized.
8358   auto isOptimizableIVTruncate =
8359       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8360     return [=](ElementCount VF) -> bool {
8361       return CM.isOptimizableIVTruncate(K, VF);
8362     };
8363   };
8364 
8365   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8366           isOptimizableIVTruncate(I), Range)) {
8367 
8368     auto *Phi = cast<PHINode>(I->getOperand(0));
8369     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8370     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8371     return createWidenInductionRecipe(Phi, I, Start, II, CM, *PSE.getSE(),
8372                                       *OrigLoop, Range);
8373   }
8374   return nullptr;
8375 }
8376 
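// Handle a non-header phi: reuse an incoming VPValue directly when all
// incoming values are equal or the phi participates in an in-loop reduction;
// otherwise build a VPBlendRecipe that selects among the incoming values based
// on edge masks.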
8377 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8378                                                 ArrayRef<VPValue *> Operands,
8379                                                 VPlanPtr &Plan) {
8380   // If all incoming values are equal, the incoming VPValue can be used directly
8381   // instead of creating a new VPBlendRecipe.
8382   VPValue *FirstIncoming = Operands[0];
8383   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8384         return FirstIncoming == Inc;
8385       })) {
8386     return Operands[0];
8387   }
8388 
8389   unsigned NumIncoming = Phi->getNumIncomingValues();
8390   // For in-loop reductions, we do not need to create an additional select.
8391   VPValue *InLoopVal = nullptr;
8392   for (unsigned In = 0; In < NumIncoming; In++) {
8393     PHINode *PhiOp =
8394         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8395     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8396       assert(!InLoopVal && "Found more than one in-loop reduction!");
8397       InLoopVal = Operands[In];
8398     }
8399   }
8400 
8401   assert((!InLoopVal || NumIncoming == 2) &&
8402          "Found an in-loop reduction for PHI with unexpected number of "
8403          "incoming values");
8404   if (InLoopVal)
8405     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8406 
8407   // We know that all PHIs in non-header blocks are converted into selects, so
8408   // we don't have to worry about the insertion order and we can just use the
8409   // builder. At this point we generate the predication tree. There may be
8410   // duplications since this is a simple recursive scan, but future
8411   // optimizations will clean it up.
8412   SmallVector<VPValue *, 2> OperandsWithMask;
8413 
8414   for (unsigned In = 0; In < NumIncoming; In++) {
8415     VPValue *EdgeMask =
8416       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8417     assert((EdgeMask || NumIncoming == 1) &&
8418            "Multiple predecessors with one having a full mask");
8419     OperandsWithMask.push_back(Operands[In]);
8420     if (EdgeMask)
8421       OperandsWithMask.push_back(EdgeMask);
8422   }
8423   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8424 }
8425 
8426 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8427                                                    ArrayRef<VPValue *> Operands,
8428                                                    VFRange &Range) const {
8429 
8430   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8431       [this, CI](ElementCount VF) {
8432         return CM.isScalarWithPredication(CI, VF);
8433       },
8434       Range);
8435 
8436   if (IsPredicated)
8437     return nullptr;
8438 
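  // Certain intrinsics are never widened as vector calls; leaving them to the
  // replication path (see handleReplication) lets them be scalarized or
  // emitted for the first lane only.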
8439   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8440   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8441              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8442              ID == Intrinsic::pseudoprobe ||
8443              ID == Intrinsic::experimental_noalias_scope_decl))
8444     return nullptr;
8445 
8446   auto willWiden = [&](ElementCount VF) -> bool {
8447     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8448     // The following case may be scalarized depending on the VF.
    // The flag indicates whether we use an intrinsic or an ordinary call for
    // the vectorized version of the instruction, i.e. whether calling the
    // intrinsic is more beneficial than a library call.
8452     bool NeedToScalarize = false;
8453     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8454     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8455     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8456     return UseVectorIntrinsic || !NeedToScalarize;
8457   };
8458 
8459   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8460     return nullptr;
8461 
8462   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8463   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8464 }
8465 
8466 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8467   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8468          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8469   // Instruction should be widened, unless it is scalar after vectorization,
8470   // scalarization is profitable or it is predicated.
8471   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8472     return CM.isScalarAfterVectorization(I, VF) ||
8473            CM.isProfitableToScalarize(I, VF) ||
8474            CM.isScalarWithPredication(I, VF);
8475   };
8476   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8477                                                              Range);
8478 }
8479 
8480 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8481                                            ArrayRef<VPValue *> Operands) const {
8482   auto IsVectorizableOpcode = [](unsigned Opcode) {
8483     switch (Opcode) {
8484     case Instruction::Add:
8485     case Instruction::And:
8486     case Instruction::AShr:
8487     case Instruction::BitCast:
8488     case Instruction::FAdd:
8489     case Instruction::FCmp:
8490     case Instruction::FDiv:
8491     case Instruction::FMul:
8492     case Instruction::FNeg:
8493     case Instruction::FPExt:
8494     case Instruction::FPToSI:
8495     case Instruction::FPToUI:
8496     case Instruction::FPTrunc:
8497     case Instruction::FRem:
8498     case Instruction::FSub:
8499     case Instruction::ICmp:
8500     case Instruction::IntToPtr:
8501     case Instruction::LShr:
8502     case Instruction::Mul:
8503     case Instruction::Or:
8504     case Instruction::PtrToInt:
8505     case Instruction::SDiv:
8506     case Instruction::Select:
8507     case Instruction::SExt:
8508     case Instruction::Shl:
8509     case Instruction::SIToFP:
8510     case Instruction::SRem:
8511     case Instruction::Sub:
8512     case Instruction::Trunc:
8513     case Instruction::UDiv:
8514     case Instruction::UIToFP:
8515     case Instruction::URem:
8516     case Instruction::Xor:
8517     case Instruction::ZExt:
8518       return true;
8519     }
8520     return false;
8521   };
8522 
8523   if (!IsVectorizableOpcode(I->getOpcode()))
8524     return nullptr;
8525 
8526   // Success: widen this instruction.
8527   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8528 }
8529 
8530 void VPRecipeBuilder::fixHeaderPhis() {
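  // Add the backedge operands that were deferred while building the plan: for
  // each recorded header phi recipe, look up the recipe created for the
  // incoming value from the original loop latch and add its value as the
  // phi's backedge operand.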
8531   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8532   for (VPHeaderPHIRecipe *R : PhisToFix) {
8533     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8534     VPRecipeBase *IncR =
8535         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8536     R->addOperand(IncR->getVPSingleValue());
8537   }
8538 }
8539 
8540 VPBasicBlock *VPRecipeBuilder::handleReplication(
8541     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8542     VPlanPtr &Plan) {
8543   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8544       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8545       Range);
8546 
8547   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8548       [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
8549       Range);
8550 
8551   // Even if the instruction is not marked as uniform, there are certain
8552   // intrinsic calls that can be effectively treated as such, so we check for
8553   // them here. Conservatively, we only do this for scalable vectors, since
8554   // for fixed-width VFs we can always fall back on full scalarization.
8555   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8556     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8557     case Intrinsic::assume:
8558     case Intrinsic::lifetime_start:
8559     case Intrinsic::lifetime_end:
      // For scalable vectors, if one of the operands is variant then we still
      // want to mark the call as uniform, which will generate one instruction
      // for just the first lane of the vector. We can't scalarize the call in
      // the same way as for fixed-width vectors because we don't know how many
      // lanes there are.
8565       //
8566       // The reasons for doing it this way for scalable vectors are:
8567       //   1. For the assume intrinsic generating the instruction for the first
      //      lane is still better than not generating any at all. For
8569       //      example, the input may be a splat across all lanes.
8570       //   2. For the lifetime start/end intrinsics the pointer operand only
8571       //      does anything useful when the input comes from a stack object,
8572       //      which suggests it should always be uniform. For non-stack objects
8573       //      the effect is to poison the object, which still allows us to
8574       //      remove the call.
8575       IsUniform = true;
8576       break;
8577     default:
8578       break;
8579     }
8580   }
8581 
8582   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8583                                        IsUniform, IsPredicated);
8584   setRecipe(I, Recipe);
8585   Plan->addVPValue(I, Recipe);
8586 
8587   // Find if I uses a predicated instruction. If so, it will use its scalar
8588   // value. Avoid hoisting the insert-element which packs the scalar value into
8589   // a vector value, as that happens iff all users use the vector value.
8590   for (VPValue *Op : Recipe->operands()) {
8591     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8592     if (!PredR)
8593       continue;
8594     auto *RepR =
8595         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8596     assert(RepR->isPredicated() &&
8597            "expected Replicate recipe to be predicated");
8598     RepR->setAlsoPack(false);
8599   }
8600 
8601   // Finalize the recipe for Instr, first if it is not predicated.
8602   if (!IsPredicated) {
8603     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8604     VPBB->appendRecipe(Recipe);
8605     return VPBB;
8606   }
8607   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8608 
8609   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8610   assert(SingleSucc && "VPBB must have a single successor when handling "
8611                        "predicated replication.");
8612   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8613   // Record predicated instructions for above packing optimizations.
8614   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8615   VPBlockUtils::insertBlockAfter(Region, VPBB);
8616   auto *RegSucc = new VPBasicBlock();
8617   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8618   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8619   return RegSucc;
8620 }
8621 
8622 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8623                                                       VPRecipeBase *PredRecipe,
8624                                                       VPlanPtr &Plan) {
8625   // Instructions marked for predication are replicated and placed under an
8626   // if-then construct to prevent side-effects.
8627 
8628   // Generate recipes to compute the block mask for this region.
8629   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8630 
8631   // Build the triangular if-then region.
8632   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8633   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8634   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8635   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8636   auto *PHIRecipe = Instr->getType()->isVoidTy()
8637                         ? nullptr
8638                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8639   if (PHIRecipe) {
8640     Plan->removeVPValueFor(Instr);
8641     Plan->addVPValue(Instr, PHIRecipe);
8642   }
8643   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8644   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8645   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8646 
8647   // Note: first set Entry as region entry and then connect successors starting
8648   // from it in order, to propagate the "parent" of each VPBasicBlock.
8649   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8650   VPBlockUtils::connectBlocks(Pred, Exit);
8651 
8652   return Region;
8653 }
8654 
8655 VPRecipeOrVPValueTy
8656 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8657                                         ArrayRef<VPValue *> Operands,
8658                                         VFRange &Range, VPlanPtr &Plan) {
8659   // First, check for specific widening recipes that deal with calls, memory
8660   // operations, inductions and Phi nodes.
8661   if (auto *CI = dyn_cast<CallInst>(Instr))
8662     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8663 
8664   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8665     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8666 
8667   VPRecipeBase *Recipe;
8668   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8669     if (Phi->getParent() != OrigLoop->getHeader())
8670       return tryToBlend(Phi, Operands, Plan);
8671     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8672       return toVPRecipeResult(Recipe);
8673 
8674     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8675     if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
8676       VPValue *StartV = Operands[0];
8677       if (Legal->isReductionVariable(Phi)) {
8678         const RecurrenceDescriptor &RdxDesc =
8679             Legal->getReductionVars().find(Phi)->second;
8680         assert(RdxDesc.getRecurrenceStartValue() ==
8681                Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8682         PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8683                                              CM.isInLoopReduction(Phi),
8684                                              CM.useOrderedReductions(RdxDesc));
8685       } else {
8686         PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8687       }
8688 
8689       // Record the incoming value from the backedge, so we can add the incoming
8690       // value from the backedge after all recipes have been created.
8691       recordRecipeOf(cast<Instruction>(
8692           Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8693       PhisToFix.push_back(PhiRecipe);
8694     } else {
8695       // TODO: record backedge value for remaining pointer induction phis.
8696       assert(Phi->getType()->isPointerTy() &&
8697              "only pointer phis should be handled here");
8698       assert(Legal->getInductionVars().count(Phi) &&
8699              "Not an induction variable");
8700       InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8701       VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
8702       PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
8703     }
8704 
8705     return toVPRecipeResult(PhiRecipe);
8706   }
8707 
8708   if (isa<TruncInst>(Instr) &&
8709       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8710                                                Range, *Plan)))
8711     return toVPRecipeResult(Recipe);
8712 
8713   if (!shouldWiden(Instr, Range))
8714     return nullptr;
8715 
8716   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8717     return toVPRecipeResult(new VPWidenGEPRecipe(
8718         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8719 
8720   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8721     bool InvariantCond =
8722         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8723     return toVPRecipeResult(new VPWidenSelectRecipe(
8724         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8725   }
8726 
8727   return toVPRecipeResult(tryToWiden(Instr, Operands));
8728 }
8729 
8730 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8731                                                         ElementCount MaxVF) {
8732   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8733 
8734   // Collect instructions from the original loop that will become trivially dead
8735   // in the vectorized loop. We don't need to vectorize these instructions. For
8736   // example, original induction update instructions can become dead because we
8737   // separately emit induction "steps" when generating code for the new loop.
8738   // Similarly, we create a new latch condition when setting up the structure
8739   // of the new loop, so the old one can become dead.
8740   SmallPtrSet<Instruction *, 4> DeadInstructions;
8741   collectTriviallyDeadInstructions(DeadInstructions);
8742 
8743   // Add assume instructions we need to drop to DeadInstructions, to prevent
8744   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
8746   // control flow is preserved, we should keep them.
8747   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8748   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8749 
8750   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8751   // Dead instructions do not need sinking. Remove them from SinkAfter.
8752   for (Instruction *I : DeadInstructions)
8753     SinkAfter.erase(I);
8754 
8755   // Cannot sink instructions after dead instructions (there won't be any
8756   // recipes for them). Instead, find the first non-dead previous instruction.
8757   for (auto &P : Legal->getSinkAfter()) {
8758     Instruction *SinkTarget = P.second;
8759     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8760     (void)FirstInst;
8761     while (DeadInstructions.contains(SinkTarget)) {
8762       assert(
8763           SinkTarget != FirstInst &&
8764           "Must find a live instruction (at least the one feeding the "
8765           "first-order recurrence PHI) before reaching beginning of the block");
8766       SinkTarget = SinkTarget->getPrevNode();
8767       assert(SinkTarget != P.first &&
8768              "sink source equals target, no sinking required");
8769     }
8770     P.second = SinkTarget;
8771   }
8772 
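  // Build one VPlan per sub-range of VFs. Each call to buildVPlanWithVPRecipes
  // may clamp SubRange.End so that all VFs covered by the resulting plan use
  // the same vectorization decisions.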
8773   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8774   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8775     VFRange SubRange = {VF, MaxVFPlusOne};
8776     VPlans.push_back(
8777         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8778     VF = SubRange.End;
8779   }
8780 }
8781 
8782 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
8783 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
8784 // BranchOnCount VPInstruction to the latch.
8785 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8786                                   bool HasNUW, bool IsVPlanNative) {
8787   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8788   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8789 
8790   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8791   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8792   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8793   if (IsVPlanNative)
8794     Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
8795   Header->insert(CanonicalIVPHI, Header->begin());
8796 
8797   auto *CanonicalIVIncrement =
8798       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8799                                : VPInstruction::CanonicalIVIncrement,
8800                         {CanonicalIVPHI}, DL);
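  // Add the increment as the second operand of the phi, i.e. its backedge
  // value.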
8801   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8802 
8803   VPBasicBlock *EB = TopRegion->getExitBasicBlock();
8804   if (IsVPlanNative) {
8805     EB = cast<VPBasicBlock>(EB->getSinglePredecessor());
8806     EB->setCondBit(nullptr);
8807   }
8808   EB->appendRecipe(CanonicalIVIncrement);
8809 
8810   auto *BranchOnCount =
8811       new VPInstruction(VPInstruction::BranchOnCount,
8812                         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8813   EB->appendRecipe(BranchOnCount);
8814 }
8815 
8816 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8817     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8818     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8819 
8820   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8821 
8822   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8823 
8824   // ---------------------------------------------------------------------------
8825   // Pre-construction: record ingredients whose recipes we'll need to further
8826   // process after constructing the initial VPlan.
8827   // ---------------------------------------------------------------------------
8828 
8829   // Mark instructions we'll need to sink later and their targets as
8830   // ingredients whose recipe we'll need to record.
8831   for (auto &Entry : SinkAfter) {
8832     RecipeBuilder.recordRecipeOf(Entry.first);
8833     RecipeBuilder.recordRecipeOf(Entry.second);
8834   }
8835   for (auto &Reduction : CM.getInLoopReductionChains()) {
8836     PHINode *Phi = Reduction.first;
8837     RecurKind Kind =
8838         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8839     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8840 
8841     RecipeBuilder.recordRecipeOf(Phi);
8842     for (auto &R : ReductionOperations) {
8843       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
8845       // need to record the ICmp recipe, so it can be removed later.
8846       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8847              "Only min/max recurrences allowed for inloop reductions");
8848       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8849         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8850     }
8851   }
8852 
8853   // For each interleave group which is relevant for this (possibly trimmed)
8854   // Range, add it to the set of groups to be later applied to the VPlan and add
8855   // placeholders for its members' Recipes which we'll be replacing with a
8856   // single VPInterleaveRecipe.
8857   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8858     auto applyIG = [IG, this](ElementCount VF) -> bool {
8859       return (VF.isVector() && // Query is illegal for VF == 1
8860               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8861                   LoopVectorizationCostModel::CM_Interleave);
8862     };
8863     if (!getDecisionAndClampRange(applyIG, Range))
8864       continue;
8865     InterleaveGroups.insert(IG);
8866     for (unsigned i = 0; i < IG->getFactor(); i++)
8867       if (Instruction *Member = IG->getMember(i))
8868         RecipeBuilder.recordRecipeOf(Member);
8869   };
8870 
8871   // ---------------------------------------------------------------------------
8872   // Build initial VPlan: Scan the body of the loop in a topological order to
8873   // visit each basic block after having visited its predecessor basic blocks.
8874   // ---------------------------------------------------------------------------
8875 
8876   // Create initial VPlan skeleton, with separate header and latch blocks.
8877   VPBasicBlock *HeaderVPBB = new VPBasicBlock();
8878   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8879   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8880   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8881   auto Plan = std::make_unique<VPlan>(TopRegion);
8882 
8883   Instruction *DLInst =
8884       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8885   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8886                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8887                         !CM.foldTailByMasking(), false);
8888 
8889   // Scan the body of the loop in a topological order to visit each basic block
8890   // after having visited its predecessor basic blocks.
8891   LoopBlocksDFS DFS(OrigLoop);
8892   DFS.perform(LI);
8893 
8894   VPBasicBlock *VPBB = HeaderVPBB;
8895   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8896   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8897     // Relevant instructions from basic block BB will be grouped into VPRecipe
8898     // ingredients and fill a new VPBasicBlock.
8899     unsigned VPBBsForBB = 0;
8900     VPBB->setName(BB->getName());
8901     Builder.setInsertPoint(VPBB);
8902 
8903     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
8905     for (Instruction &I : BB->instructionsWithoutDebug()) {
8906       Instruction *Instr = &I;
8907 
8908       // First filter out irrelevant instructions, to ensure no recipes are
8909       // built for them.
8910       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8911         continue;
8912 
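      // Collect the operands as VPValues. Header phis only get their start
      // value (the incoming value from the preheader) here; the backedge value
      // is added later where needed (see fixHeaderPhis).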
8913       SmallVector<VPValue *, 4> Operands;
8914       auto *Phi = dyn_cast<PHINode>(Instr);
8915       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8916         Operands.push_back(Plan->getOrAddVPValue(
8917             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8918       } else {
8919         auto OpRange = Plan->mapToVPValues(Instr->operands());
8920         Operands = {OpRange.begin(), OpRange.end()};
8921       }
8922       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8923               Instr, Operands, Range, Plan)) {
8924         // If Instr can be simplified to an existing VPValue, use it.
8925         if (RecipeOrValue.is<VPValue *>()) {
8926           auto *VPV = RecipeOrValue.get<VPValue *>();
8927           Plan->addVPValue(Instr, VPV);
8928           // If the re-used value is a recipe, register the recipe for the
8929           // instruction, in case the recipe for Instr needs to be recorded.
8930           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
8931             RecipeBuilder.setRecipe(Instr, R);
8932           continue;
8933         }
8934         // Otherwise, add the new recipe.
8935         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8936         for (auto *Def : Recipe->definedValues()) {
8937           auto *UV = Def->getUnderlyingValue();
8938           Plan->addVPValue(UV, Def);
8939         }
8940 
8941         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8942             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8943           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8944           // of the header block. That can happen for truncates of induction
8945           // variables. Those recipes are moved to the phi section of the header
8946           // block after applying SinkAfter, which relies on the original
8947           // position of the trunc.
8948           assert(isa<TruncInst>(Instr));
8949           InductionsToMove.push_back(
8950               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8951         }
8952         RecipeBuilder.setRecipe(Instr, Recipe);
8953         VPBB->appendRecipe(Recipe);
8954         continue;
8955       }
8956 
8957       // Otherwise, if all widening options failed, Instruction is to be
8958       // replicated. This may create a successor for VPBB.
8959       VPBasicBlock *NextVPBB =
8960           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8961       if (NextVPBB != VPBB) {
8962         VPBB = NextVPBB;
8963         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8964                                     : "");
8965       }
8966     }
8967 
8968     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8969     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8970   }
8971 
8972   // Fold the last, empty block into its predecessor.
8973   VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
8974   assert(VPBB && "expected to fold last (empty) block");
8975   // After here, VPBB should not be used.
8976   VPBB = nullptr;
8977 
8978   assert(isa<VPRegionBlock>(Plan->getEntry()) &&
8979          !Plan->getEntry()->getEntryBasicBlock()->empty() &&
8980          "entry block must be set to a VPRegionBlock having a non-empty entry "
8981          "VPBasicBlock");
8982   RecipeBuilder.fixHeaderPhis();
8983 
8984   // ---------------------------------------------------------------------------
8985   // Transform initial VPlan: Apply previously taken decisions, in order, to
8986   // bring the VPlan to its final state.
8987   // ---------------------------------------------------------------------------
8988 
8989   // Apply Sink-After legal constraints.
8990   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
8991     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
8992     if (Region && Region->isReplicator()) {
8993       assert(Region->getNumSuccessors() == 1 &&
8994              Region->getNumPredecessors() == 1 && "Expected SESE region!");
8995       assert(R->getParent()->size() == 1 &&
8996              "A recipe in an original replicator region must be the only "
8997              "recipe in its block");
8998       return Region;
8999     }
9000     return nullptr;
9001   };
9002   for (auto &Entry : SinkAfter) {
9003     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9004     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9005 
9006     auto *TargetRegion = GetReplicateRegion(Target);
9007     auto *SinkRegion = GetReplicateRegion(Sink);
9008     if (!SinkRegion) {
9009       // If the sink source is not a replicate region, sink the recipe directly.
9010       if (TargetRegion) {
9011         // The target is in a replication region, make sure to move Sink to
9012         // the block after it, not into the replication region itself.
9013         VPBasicBlock *NextBlock =
9014             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9015         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9016       } else
9017         Sink->moveAfter(Target);
9018       continue;
9019     }
9020 
9021     // The sink source is in a replicate region. Unhook the region from the CFG.
9022     auto *SinkPred = SinkRegion->getSinglePredecessor();
9023     auto *SinkSucc = SinkRegion->getSingleSuccessor();
9024     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9025     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9026     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9027 
9028     if (TargetRegion) {
9029       // The target recipe is also in a replicate region, move the sink region
9030       // after the target region.
9031       auto *TargetSucc = TargetRegion->getSingleSuccessor();
9032       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9033       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9034       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9035     } else {
      // The sink source is in a replicate region; we need to move the whole
9037       // replicate region, which should only contain a single recipe in the
9038       // main block.
9039       auto *SplitBlock =
9040           Target->getParent()->splitAt(std::next(Target->getIterator()));
9041 
9042       auto *SplitPred = SplitBlock->getSinglePredecessor();
9043 
9044       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9045       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9046       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9047     }
9048   }
9049 
9050   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
9051   VPlanTransforms::removeRedundantInductionCasts(*Plan);
9052 
9053   // Now that sink-after is done, move induction recipes for optimized truncates
9054   // to the phi section of the header block.
9055   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
9056     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9057 
9058   // Adjust the recipes for any inloop reductions.
9059   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan,
9060                              RecipeBuilder, Range.Start);
9061 
9062   // Introduce a recipe to combine the incoming and previous values of a
9063   // first-order recurrence.
9064   for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9065     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9066     if (!RecurPhi)
9067       continue;
9068 
9069     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
9070     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9071     auto *Region = GetReplicateRegion(PrevRecipe);
9072     if (Region)
9073       InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
9074     if (Region || PrevRecipe->isPhi())
9075       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9076     else
9077       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9078 
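    // The splice combines the recurrence phi (the value carried over from the
    // previous vector iteration) with the newly computed backedge value, so
    // each lane of the result holds the recurrence value of the preceding
    // scalar iteration.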
9079     auto *RecurSplice = cast<VPInstruction>(
9080         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9081                              {RecurPhi, RecurPhi->getBackedgeValue()}));
9082 
9083     RecurPhi->replaceAllUsesWith(RecurSplice);
9084     // Set the first operand of RecurSplice to RecurPhi again, after replacing
9085     // all users.
9086     RecurSplice->setOperand(0, RecurPhi);
9087   }
9088 
9089   // Interleave memory: for each Interleave Group we marked earlier as relevant
9090   // for this VPlan, replace the Recipes widening its memory instructions with a
9091   // single VPInterleaveRecipe at its insertion point.
9092   for (auto IG : InterleaveGroups) {
9093     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9094         RecipeBuilder.getRecipe(IG->getInsertPos()));
9095     SmallVector<VPValue *, 4> StoredValues;
9096     for (unsigned i = 0; i < IG->getFactor(); ++i)
9097       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9098         auto *StoreR =
9099             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9100         StoredValues.push_back(StoreR->getStoredValue());
9101       }
9102 
9103     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9104                                         Recipe->getMask());
9105     VPIG->insertBefore(Recipe);
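    // Redirect users of each non-void member to the corresponding result of
    // the interleave recipe and erase the now-redundant per-member recipes.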
9106     unsigned J = 0;
9107     for (unsigned i = 0; i < IG->getFactor(); ++i)
9108       if (Instruction *Member = IG->getMember(i)) {
9109         if (!Member->getType()->isVoidTy()) {
9110           VPValue *OriginalV = Plan->getVPValue(Member);
9111           Plan->removeVPValueFor(Member);
9112           Plan->addVPValue(Member, VPIG->getVPValue(J));
9113           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9114           J++;
9115         }
9116         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9117       }
9118   }
9119 
  // From this point onwards, VPlan-to-VPlan transformations may change the
  // plan in ways that make looking up VPValues via their original IR values
  // incorrect.
9122   Plan->disableValue2VPValue();
9123 
9124   VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9125   VPlanTransforms::sinkScalarOperands(*Plan);
9126   VPlanTransforms::mergeReplicateRegions(*Plan);
9127   VPlanTransforms::removeDeadRecipes(*Plan, *OrigLoop);
9128 
9129   std::string PlanName;
9130   raw_string_ostream RSO(PlanName);
9131   ElementCount VF = Range.Start;
9132   Plan->addVF(VF);
9133   RSO << "Initial VPlan for VF={" << VF;
9134   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9135     Plan->addVF(VF);
9136     RSO << "," << VF;
9137   }
9138   RSO << "},UF>=1";
9139   RSO.flush();
9140   Plan->setName(PlanName);
9141 
9142   // Fold Exit block into its predecessor if possible.
9143   // TODO: Fold block earlier once all VPlan transforms properly maintain a
9144   // VPBasicBlock as exit.
9145   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
9146 
9147   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9148   return Plan;
9149 }
9150 
9151 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction-level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
9154   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9155   // the vectorization pipeline.
9156   assert(!OrigLoop->isInnermost());
9157   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9158 
9159   // Create new empty VPlan
9160   auto Plan = std::make_unique<VPlan>();
9161 
9162   // Build hierarchical CFG
9163   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9164   HCFGBuilder.buildHierarchicalCFG();
9165 
9166   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9167        VF *= 2)
9168     Plan->addVF(VF);
9169 
9170   if (EnableVPlanPredication) {
9171     VPlanPredicator VPP(*Plan);
9172     VPP.predicate();
9173 
9174     // Avoid running transformation to recipes until masked code generation in
9175     // VPlan-native path is in place.
9176     return Plan;
9177   }
9178 
9179   SmallPtrSet<Instruction *, 1> DeadInstructions;
9180   VPlanTransforms::VPInstructionsToVPRecipes(
9181       OrigLoop, Plan,
9182       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9183       DeadInstructions, *PSE.getSE());
9184 
9185   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9186                         true, true);
9187   return Plan;
9188 }
9189 
9190 // Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be converted
9192 // to reductions, with one operand being vector and the other being the scalar
9193 // reduction chain. For other reductions, a select is introduced between the phi
9194 // and live-out recipes when folding the tail.
9195 void LoopVectorizationPlanner::adjustRecipesForReductions(
9196     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9197     ElementCount MinVF) {
9198   for (auto &Reduction : CM.getInLoopReductionChains()) {
9199     PHINode *Phi = Reduction.first;
9200     const RecurrenceDescriptor &RdxDesc =
9201         Legal->getReductionVars().find(Phi)->second;
9202     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9203 
9204     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9205       continue;
9206 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
9209     // which of the two operands will remain scalar and which will be reduced.
9210     // For minmax the chain will be the select instructions.
9211     Instruction *Chain = Phi;
9212     for (Instruction *R : ReductionOperations) {
9213       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9214       RecurKind Kind = RdxDesc.getRecurrenceKind();
9215 
9216       VPValue *ChainOp = Plan->getVPValue(Chain);
9217       unsigned FirstOpId;
9218       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9219              "Only min/max recurrences allowed for inloop reductions");
9220       // Recognize a call to the llvm.fmuladd intrinsic.
9221       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9222       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9223              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9224       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9225         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9226                "Expected to replace a VPWidenSelectSC");
9227         FirstOpId = 1;
9228       } else {
9229         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9230                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9231                "Expected to replace a VPWidenSC");
9232         FirstOpId = 0;
9233       }
9234       unsigned VecOpId =
9235           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9236       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9237 
9238       auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
9239                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9240                          : nullptr;
9241 
9242       if (IsFMulAdd) {
9243         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9244         // need to create an fmul recipe to use as the vector operand for the
9245         // fadd reduction.
9246         VPInstruction *FMulRecipe = new VPInstruction(
9247             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9248         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9249         WidenRecipe->getParent()->insert(FMulRecipe,
9250                                          WidenRecipe->getIterator());
9251         VecOp = FMulRecipe;
9252       }
9253       VPReductionRecipe *RedRecipe =
9254           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9255       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9256       Plan->removeVPValueFor(R);
9257       Plan->addVPValue(R, RedRecipe);
9258       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9259       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9260       WidenRecipe->eraseFromParent();
9261 
9262       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9263         VPRecipeBase *CompareRecipe =
9264             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9265         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9266                "Expected to replace a VPWidenSC");
9267         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9268                "Expected no remaining users");
9269         CompareRecipe->eraseFromParent();
9270       }
9271       Chain = R;
9272     }
9273   }
9274 
9275   // If tail is folded by masking, introduce selects between the phi
9276   // and the live-out instruction of each reduction, at the beginning of the
9277   // dedicated latch block.
9278   if (CM.foldTailByMasking()) {
9279     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9280     for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9281       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9282       if (!PhiR || PhiR->isInLoop())
9283         continue;
9284       VPValue *Cond =
9285           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9286       VPValue *Red = PhiR->getBackedgeValue();
9287       assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
9288              "reduction recipe must be defined before latch");
9289       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9290     }
9291   }
9292 }
9293 
9294 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9295 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9296                                VPSlotTracker &SlotTracker) const {
9297   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9298   IG->getInsertPos()->printAsOperand(O, false);
9299   O << ", ";
9300   getAddr()->printAsOperand(O, SlotTracker);
9301   VPValue *Mask = getMask();
9302   if (Mask) {
9303     O << ", ";
9304     Mask->printAsOperand(O, SlotTracker);
9305   }
9306 
9307   unsigned OpIdx = 0;
9308   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9309     if (!IG->getMember(i))
9310       continue;
9311     if (getNumStoreOperands() > 0) {
9312       O << "\n" << Indent << "  store ";
9313       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9314       O << " to index " << i;
9315     } else {
9316       O << "\n" << Indent << "  ";
9317       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9318       O << " = load from index " << i;
9319     }
9320     ++OpIdx;
9321   }
9322 }
9323 #endif
9324 
9325 void VPWidenCallRecipe::execute(VPTransformState &State) {
9326   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9327                                   *this, State);
9328 }
9329 
9330 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9331   auto &I = *cast<SelectInst>(getUnderlyingInstr());
9332   State.ILV->setDebugLocFromInst(&I);
9333 
  // The condition can be loop invariant but still defined inside the
9335   // loop. This means that we can't just use the original 'cond' value.
9336   // We have to take the 'vectorized' value and pick the first lane.
9337   // Instcombine will make this a no-op.
9338   auto *InvarCond =
9339       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9340 
9341   for (unsigned Part = 0; Part < State.UF; ++Part) {
9342     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9343     Value *Op0 = State.get(getOperand(1), Part);
9344     Value *Op1 = State.get(getOperand(2), Part);
9345     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9346     State.set(this, Sel, Part);
9347     State.ILV->addMetadata(Sel, &I);
9348   }
9349 }
9350 
9351 void VPWidenRecipe::execute(VPTransformState &State) {
9352   auto &I = *cast<Instruction>(getUnderlyingValue());
9353   auto &Builder = State.Builder;
9354   switch (I.getOpcode()) {
9355   case Instruction::Call:
9356   case Instruction::Br:
9357   case Instruction::PHI:
9358   case Instruction::GetElementPtr:
9359   case Instruction::Select:
9360     llvm_unreachable("This instruction is handled by a different recipe.");
9361   case Instruction::UDiv:
9362   case Instruction::SDiv:
9363   case Instruction::SRem:
9364   case Instruction::URem:
9365   case Instruction::Add:
9366   case Instruction::FAdd:
9367   case Instruction::Sub:
9368   case Instruction::FSub:
9369   case Instruction::FNeg:
9370   case Instruction::Mul:
9371   case Instruction::FMul:
9372   case Instruction::FDiv:
9373   case Instruction::FRem:
9374   case Instruction::Shl:
9375   case Instruction::LShr:
9376   case Instruction::AShr:
9377   case Instruction::And:
9378   case Instruction::Or:
9379   case Instruction::Xor: {
9380     // Just widen unops and binops.
9381     State.ILV->setDebugLocFromInst(&I);
9382 
9383     for (unsigned Part = 0; Part < State.UF; ++Part) {
9384       SmallVector<Value *, 2> Ops;
9385       for (VPValue *VPOp : operands())
9386         Ops.push_back(State.get(VPOp, Part));
9387 
9388       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
9389 
9390       if (auto *VecOp = dyn_cast<Instruction>(V)) {
9391         VecOp->copyIRFlags(&I);
9392 
9393         // If the instruction is vectorized and was in a basic block that needed
9394         // predication, we can't propagate poison-generating flags (nuw/nsw,
9395         // exact, etc.). The control flow has been linearized and the
        // instruction is no longer guarded by the predicate, which could cause
        // the flag properties to no longer hold.
9398         if (State.MayGeneratePoisonRecipes.contains(this))
9399           VecOp->dropPoisonGeneratingFlags();
9400       }
9401 
9402       // Use this vector value for all users of the original instruction.
9403       State.set(this, V, Part);
9404       State.ILV->addMetadata(V, &I);
9405     }
9406 
9407     break;
9408   }
9409   case Instruction::ICmp:
9410   case Instruction::FCmp: {
9411     // Widen compares. Generate vector compares.
9412     bool FCmp = (I.getOpcode() == Instruction::FCmp);
9413     auto *Cmp = cast<CmpInst>(&I);
9414     State.ILV->setDebugLocFromInst(Cmp);
9415     for (unsigned Part = 0; Part < State.UF; ++Part) {
9416       Value *A = State.get(getOperand(0), Part);
9417       Value *B = State.get(getOperand(1), Part);
9418       Value *C = nullptr;
9419       if (FCmp) {
9420         // Propagate fast math flags.
9421         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9422         Builder.setFastMathFlags(Cmp->getFastMathFlags());
9423         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
9424       } else {
9425         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
9426       }
9427       State.set(this, C, Part);
9428       State.ILV->addMetadata(C, &I);
9429     }
9430 
9431     break;
9432   }
9433 
9434   case Instruction::ZExt:
9435   case Instruction::SExt:
9436   case Instruction::FPToUI:
9437   case Instruction::FPToSI:
9438   case Instruction::FPExt:
9439   case Instruction::PtrToInt:
9440   case Instruction::IntToPtr:
9441   case Instruction::SIToFP:
9442   case Instruction::UIToFP:
9443   case Instruction::Trunc:
9444   case Instruction::FPTrunc:
9445   case Instruction::BitCast: {
9446     auto *CI = cast<CastInst>(&I);
9447     State.ILV->setDebugLocFromInst(CI);
9448 
    // Vectorize casts.
9450     Type *DestTy = (State.VF.isScalar())
9451                        ? CI->getType()
9452                        : VectorType::get(CI->getType(), State.VF);
9453 
9454     for (unsigned Part = 0; Part < State.UF; ++Part) {
9455       Value *A = State.get(getOperand(0), Part);
9456       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
9457       State.set(this, Cast, Part);
9458       State.ILV->addMetadata(Cast, &I);
9459     }
9460     break;
9461   }
9462   default:
9463     // This instruction is not vectorized by simple widening.
9464     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
9465     llvm_unreachable("Unhandled instruction!");
9466   } // end of switch.
9467 }
9468 
9469 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9470   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
9471   // Construct a vector GEP by widening the operands of the scalar GEP as
9472   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
9473   // results in a vector of pointers when at least one operand of the GEP
9474   // is vector-typed. Thus, to keep the representation compact, we only use
9475   // vector-typed operands for loop-varying values.
9476 
9477   if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
9478     // If we are vectorizing, but the GEP has only loop-invariant operands,
9479     // the GEP we build (by only using vector-typed operands for
9480     // loop-varying values) would be a scalar pointer. Thus, to ensure we
9481     // produce a vector of pointers, we need to either arbitrarily pick an
9482     // operand to broadcast, or broadcast a clone of the original GEP.
9483     // Here, we broadcast a clone of the original.
9484     //
9485     // TODO: If at some point we decide to scalarize instructions having
9486     //       loop-invariant operands, this special case will no longer be
9487     //       required. We would add the scalarization decision to
9488     //       collectLoopScalars() and teach getVectorValue() to broadcast
9489     //       the lane-zero scalar value.
9490     auto *Clone = State.Builder.Insert(GEP->clone());
9491     for (unsigned Part = 0; Part < State.UF; ++Part) {
9492       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
9493       State.set(this, EntryPart, Part);
9494       State.ILV->addMetadata(EntryPart, GEP);
9495     }
9496   } else {
9497     // If the GEP has at least one loop-varying operand, we are sure to
9498     // produce a vector of pointers. But if we are only unrolling, we want
9499     // to produce a scalar GEP for each unroll part. Thus, the GEP we
9500     // produce with the code below will be scalar (if VF == 1) or vector
9501     // (otherwise). Note that for the unroll-only case, we still maintain
9502     // values in the vector mapping with initVector, as we do for other
9503     // instructions.
9504     for (unsigned Part = 0; Part < State.UF; ++Part) {
9505       // The pointer operand of the new GEP. If it's loop-invariant, we
9506       // won't broadcast it.
9507       auto *Ptr = IsPtrLoopInvariant
9508                       ? State.get(getOperand(0), VPIteration(0, 0))
9509                       : State.get(getOperand(0), Part);
9510 
9511       // Collect all the indices for the new GEP. If any index is
9512       // loop-invariant, we won't broadcast it.
9513       SmallVector<Value *, 4> Indices;
9514       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9515         VPValue *Operand = getOperand(I);
9516         if (IsIndexLoopInvariant[I - 1])
9517           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9518         else
9519           Indices.push_back(State.get(Operand, Part));
9520       }
9521 
9522       // If the GEP instruction is vectorized and was in a basic block that
9523       // needed predication, we can't propagate the poison-generating 'inbounds'
9524       // flag. The control flow has been linearized and the GEP is no longer
      // guarded by the predicate, which could cause the 'inbounds' property to
      // no longer hold.
9527       bool IsInBounds =
9528           GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9529 
9530       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
9531       // but it should be a vector, otherwise.
9532       auto *NewGEP = IsInBounds
9533                          ? State.Builder.CreateInBoundsGEP(
9534                                GEP->getSourceElementType(), Ptr, Indices)
9535                          : State.Builder.CreateGEP(GEP->getSourceElementType(),
9536                                                    Ptr, Indices);
9537       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9538              "NewGEP is not a pointer vector");
9539       State.set(this, NewGEP, Part);
9540       State.ILV->addMetadata(NewGEP, GEP);
9541     }
9542   }
9543 }
9544 
9545 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9546   assert(!State.Instance && "Int or FP induction being replicated.");
9547 
9548   Value *Start = getStartValue()->getLiveInIRValue();
9549   const InductionDescriptor &ID = getInductionDescriptor();
9550   TruncInst *Trunc = getTruncInst();
9551   IRBuilderBase &Builder = State.Builder;
9552   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9553   assert(State.VF.isVector() && "must have vector VF");
9554 
9555   // The value from the original loop to which we are mapping the new induction
9556   // variable.
9557   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9558 
9559   auto &DL = EntryVal->getModule()->getDataLayout();
9560 
9561   // Generate code for the induction step. Note that induction steps are
  // required to be loop-invariant.
9563   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
9564     if (SE.isSCEVable(IV->getType())) {
9565       SCEVExpander Exp(SE, DL, "induction");
9566       return Exp.expandCodeFor(Step, Step->getType(),
9567                                State.CFG.VectorPreHeader->getTerminator());
9568     }
9569     return cast<SCEVUnknown>(Step)->getValue();
9570   };
9571 
9572   // Fast-math-flags propagate from the original induction instruction.
9573   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9574   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9575     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9576 
9577   // Now do the actual transformations, and start with creating the step value.
9578   Value *Step = CreateStepValue(ID.getStep());
9579 
9580   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
9581          "Expected either an induction phi-node or a truncate of it!");
9582 
9583   // Construct the initial value of the vector IV in the vector loop preheader
9584   auto CurrIP = Builder.saveIP();
9585   Builder.SetInsertPoint(State.CFG.VectorPreHeader->getTerminator());
9586   if (isa<TruncInst>(EntryVal)) {
9587     assert(Start->getType()->isIntegerTy() &&
9588            "Truncation requires an integer type");
9589     auto *TruncType = cast<IntegerType>(EntryVal->getType());
9590     Step = Builder.CreateTrunc(Step, TruncType);
9591     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
9592   }
9593 
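  // Splat the (possibly truncated) start value and add the per-lane multiples
  // of the step, {0, Step, 2*Step, ...}, to form the initial vector induction
  // value.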
9594   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
9595   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
9596   Value *SteppedStart = getStepVector(
9597       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
9598 
9599   // We create vector phi nodes for both integer and floating-point induction
9600   // variables. Here, we determine the kind of arithmetic we will perform.
9601   Instruction::BinaryOps AddOp;
9602   Instruction::BinaryOps MulOp;
9603   if (Step->getType()->isIntegerTy()) {
9604     AddOp = Instruction::Add;
9605     MulOp = Instruction::Mul;
9606   } else {
9607     AddOp = ID.getInductionOpcode();
9608     MulOp = Instruction::FMul;
9609   }
9610 
9611   // Multiply the vectorization factor by the step using integer or
9612   // floating-point arithmetic as appropriate.
9613   Type *StepType = Step->getType();
9614   Value *RuntimeVF;
9615   if (Step->getType()->isFloatingPointTy())
9616     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
9617   else
9618     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
9619   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
9620 
9621   // Create a vector splat to use in the induction update.
9622   //
9623   // FIXME: If the step is non-constant, we create the vector splat with
9624   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
9625   //        handle a constant vector splat.
9626   Value *SplatVF = isa<Constant>(Mul)
9627                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
9628                        : Builder.CreateVectorSplat(State.VF, Mul);
9629   Builder.restoreIP(CurrIP);
9630 
9631   // We may need to add the step a number of times, depending on the unroll
9632   // factor. The last of those goes into the PHI.
9633   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
9634                                     &*State.CFG.PrevBB->getFirstInsertionPt());
9635   VecInd->setDebugLoc(EntryVal->getDebugLoc());
9636   Instruction *LastInduction = VecInd;
9637   for (unsigned Part = 0; Part < State.UF; ++Part) {
9638     State.set(this, LastInduction, Part);
9639 
9640     if (isa<TruncInst>(EntryVal))
9641       State.ILV->addMetadata(LastInduction, EntryVal);
9642 
9643     LastInduction = cast<Instruction>(
9644         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
9645     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
9646   }
9647 
9648   // Move the last step to the end of the latch block. This ensures consistent
9649   // placement of all induction updates.
9650   auto *LoopVectorLatch =
9651       State.LI->getLoopFor(State.CFG.PrevBB)->getLoopLatch();
9652   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
9653   LastInduction->moveBefore(Br);
9654   LastInduction->setName("vec.ind.next");
9655 
9656   VecInd->addIncoming(SteppedStart, State.CFG.VectorPreHeader);
9657   VecInd->addIncoming(LastInduction, LoopVectorLatch);
9658 }
9659 
9660 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9661   assert(!State.Instance && "VPScalarIVStepsRecipe being replicated.");
9662 
9663   // Fast-math-flags propagate from the original induction instruction.
9664   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9665   if (IndDesc.getInductionBinOp() &&
9666       isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9667     State.Builder.setFastMathFlags(
9668         IndDesc.getInductionBinOp()->getFastMathFlags());
9669 
9670   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9671   auto CreateScalarIV = [&](Value *&Step) -> Value * {
9672     Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9673     auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
9674     if (!isCanonical() || CanonicalIV->getType() != Ty) {
9675       ScalarIV =
9676           Ty->isIntegerTy()
9677               ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty)
9678               : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty);
9679       ScalarIV = emitTransformedIndex(State.Builder, ScalarIV,
9680                                       getStartValue()->getLiveInIRValue(), Step,
9681                                       IndDesc);
9682       ScalarIV->setName("offset.idx");
9683     }
9684     if (TruncToTy) {
9685       assert(Step->getType()->isIntegerTy() &&
9686              "Truncation requires an integer step");
9687       ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy);
9688       Step = State.Builder.CreateTrunc(Step, TruncToTy);
9689     }
9690     return ScalarIV;
9691   };
9692 
9693   Value *ScalarIV = CreateScalarIV(Step);
9694   if (State.VF.isVector()) {
9695     buildScalarSteps(ScalarIV, Step, IndDesc, this, State);
9696     return;
9697   }
9698 
9699   for (unsigned Part = 0; Part < State.UF; ++Part) {
9700     assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
9701     Value *EntryPart;
9702     if (Step->getType()->isFloatingPointTy()) {
9703       Value *StartIdx =
9704           getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part);
9705       // Floating-point operations inherit FMF via the builder's flags.
9706       Value *MulOp = State.Builder.CreateFMul(StartIdx, Step);
9707       EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(),
9708                                             ScalarIV, MulOp);
9709     } else {
9710       Value *StartIdx =
9711           getRuntimeVF(State.Builder, Step->getType(), State.VF * Part);
9712       EntryPart = State.Builder.CreateAdd(
9713           ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction");
9714     }
9715     State.set(this, EntryPart, Part);
9716   }
9717 }
9718 
9719 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9720   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
9721                                  State);
9722 }
9723 
9724 void VPBlendRecipe::execute(VPTransformState &State) {
9725   State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9726   // We know that all PHIs in non-header blocks are converted into
9727   // selects, so we don't have to worry about the insertion order and we
9728   // can just use the builder.
9729   // At this point we generate the predication tree. There may be
9730   // duplications since this is a simple recursive scan, but future
9731   // optimizations will clean it up.
9732 
9733   unsigned NumIncoming = getNumIncomingValues();
9734 
9735   // Generate a sequence of selects of the form:
9736   // SELECT(Mask3, In3,
9737   //        SELECT(Mask2, In2,
9738   //               SELECT(Mask1, In1,
9739   //                      In0)))
  // Note that Mask0 is never used: lanes for which no path reaches this phi
  // (and which are therefore essentially undef) take their value from In0.
9742   InnerLoopVectorizer::VectorParts Entry(State.UF);
9743   for (unsigned In = 0; In < NumIncoming; ++In) {
9744     for (unsigned Part = 0; Part < State.UF; ++Part) {
9745       // We might have single edge PHIs (blocks) - use an identity
9746       // 'select' for the first PHI operand.
9747       Value *In0 = State.get(getIncomingValue(In), Part);
9748       if (In == 0)
9749         Entry[Part] = In0; // Initialize with the first incoming value.
9750       else {
9751         // Select between the current value and the previous incoming edge
9752         // based on the incoming mask.
9753         Value *Cond = State.get(getMask(In), Part);
9754         Entry[Part] =
9755             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9756       }
9757     }
9758   }
9759   for (unsigned Part = 0; Part < State.UF; ++Part)
9760     State.set(this, Entry[Part], Part);
9761 }
9762 
9763 void VPInterleaveRecipe::execute(VPTransformState &State) {
9764   assert(!State.Instance && "Interleave group being replicated.");
9765   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9766                                       getStoredValues(), getMask());
9767 }
9768 
9769 void VPReductionRecipe::execute(VPTransformState &State) {
9770   assert(!State.Instance && "Reduction being replicated.");
9771   Value *PrevInChain = State.get(getChainOp(), 0);
9772   RecurKind Kind = RdxDesc->getRecurrenceKind();
9773   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9774   // Propagate the fast-math flags carried by the underlying instruction.
9775   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9776   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9777   for (unsigned Part = 0; Part < State.UF; ++Part) {
9778     Value *NewVecOp = State.get(getVecOp(), Part);
9779     if (VPValue *Cond = getCondOp()) {
9780       Value *NewCond = State.get(Cond, Part);
9781       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9782       Value *Iden = RdxDesc->getRecurrenceIdentity(
9783           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9784       Value *IdenVec =
9785           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9786       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9787       NewVecOp = Select;
9788     }
9789     Value *NewRed;
9790     Value *NextInChain;
9791     if (IsOrdered) {
9792       if (State.VF.isVector())
9793         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9794                                         PrevInChain);
9795       else
9796         NewRed = State.Builder.CreateBinOp(
9797             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9798             NewVecOp);
9799       PrevInChain = NewRed;
9800     } else {
9801       PrevInChain = State.get(getChainOp(), Part);
9802       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9803     }
9804     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9805       NextInChain =
9806           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9807                          NewRed, PrevInChain);
9808     } else if (IsOrdered)
9809       NextInChain = NewRed;
9810     else
9811       NextInChain = State.Builder.CreateBinOp(
9812           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9813           PrevInChain);
9814     State.set(this, NextInChain, Part);
9815   }
9816 }
9817 
9818 void VPReplicateRecipe::execute(VPTransformState &State) {
9819   if (State.Instance) { // Generate a single instance.
9820     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9821     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9822                                     IsPredicated, State);
9823     // Insert scalar instance packing it into a vector.
9824     if (AlsoPack && State.VF.isVector()) {
9825       // If we're constructing lane 0, initialize to start from poison.
9826       if (State.Instance->Lane.isFirstLane()) {
9827         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9828         Value *Poison = PoisonValue::get(
9829             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9830         State.set(this, Poison, State.Instance->Part);
9831       }
9832       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9833     }
9834     return;
9835   }
9836 
  // Generate scalar instances for all VF lanes of all UF parts, unless the
  // instruction is uniform, in which case generate only the first lane for
  // each of the UF parts.
9840   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9841   assert((!State.VF.isScalable() || IsUniform) &&
9842          "Can't scalarize a scalable vector");
9843   for (unsigned Part = 0; Part < State.UF; ++Part)
9844     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9845       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9846                                       VPIteration(Part, Lane), IsPredicated,
9847                                       State);
9848 }
9849 
9850 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9851   assert(State.Instance && "Branch on Mask works only on single instance.");
9852 
9853   unsigned Part = State.Instance->Part;
9854   unsigned Lane = State.Instance->Lane.getKnownLane();
9855 
9856   Value *ConditionBit = nullptr;
9857   VPValue *BlockInMask = getMask();
9858   if (BlockInMask) {
9859     ConditionBit = State.get(BlockInMask, Part);
9860     if (ConditionBit->getType()->isVectorTy())
9861       ConditionBit = State.Builder.CreateExtractElement(
9862           ConditionBit, State.Builder.getInt32(Lane));
9863   } else // Block in mask is all-one.
9864     ConditionBit = State.Builder.getTrue();
9865 
9866   // Replace the temporary unreachable terminator with a new conditional branch,
9867   // whose two destinations will be set later when they are created.
9868   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9869   assert(isa<UnreachableInst>(CurrentTerminator) &&
9870          "Expected to replace unreachable terminator with conditional branch.");
9871   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9872   CondBr->setSuccessor(0, nullptr);
9873   ReplaceInstWithInst(CurrentTerminator, CondBr);
9874 }
9875 
9876 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9877   assert(State.Instance && "Predicated instruction PHI works per instance.");
9878   Instruction *ScalarPredInst =
9879       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9880   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9881   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9882   assert(PredicatingBB && "Predicated block has no single predecessor.");
9883   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9884          "operand must be VPReplicateRecipe");
9885 
9886   // By current pack/unpack logic we need to generate only a single phi node: if
9887   // a vector value for the predicated instruction exists at this point it means
9888   // the instruction has vector users only, and a phi for the vector value is
9889   // needed. In this case the recipe of the predicated instruction is marked to
9890   // also do that packing, thereby "hoisting" the insert-element sequence.
9891   // Otherwise, a phi node for the scalar value is needed.
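  // Illustrative shape of the vector-case phi (types and value names assumed
  // for the example):
  //   %vphi = phi <4 x i32> [ %vec.before.insert, %predicating.bb ],
  //                         [ %vec.with.insert,   %predicated.bb ]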
9892   unsigned Part = State.Instance->Part;
9893   if (State.hasVectorValue(getOperand(0), Part)) {
9894     Value *VectorValue = State.get(getOperand(0), Part);
9895     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9896     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9897     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9898     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9899     if (State.hasVectorValue(this, Part))
9900       State.reset(this, VPhi, Part);
9901     else
9902       State.set(this, VPhi, Part);
9903     // NOTE: Currently we need to update the value of the operand, so the next
9904     // predicated iteration inserts its generated value in the correct vector.
9905     State.reset(getOperand(0), VPhi, Part);
9906   } else {
9907     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9908     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9909     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9910                      PredicatingBB);
9911     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9912     if (State.hasScalarValue(this, *State.Instance))
9913       State.reset(this, Phi, *State.Instance);
9914     else
9915       State.set(this, Phi, *State.Instance);
9916     // NOTE: Currently we need to update the value of the operand, so the next
9917     // predicated iteration inserts its generated value in the correct vector.
9918     State.reset(getOperand(0), Phi, *State.Instance);
9919   }
9920 }
9921 
9922 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9923   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9924 
9925   // Attempt to issue a wide load.
9926   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9927   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9928 
9929   assert((LI || SI) && "Invalid Load/Store instruction");
9930   assert((!SI || StoredValue) && "No stored value provided for widened store");
9931   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9932 
9933   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9934 
9935   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9936   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9937   bool CreateGatherScatter = !Consecutive;
9938 
9939   auto &Builder = State.Builder;
9940   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9941   bool isMaskRequired = getMask();
9942   if (isMaskRequired)
9943     for (unsigned Part = 0; Part < State.UF; ++Part)
9944       BlockInMaskParts[Part] = State.get(getMask(), Part);
9945 
9946   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9947     // Calculate the pointer for the specific unroll-part.
9948     GetElementPtrInst *PartPtr = nullptr;
9949 
9950     bool InBounds = false;
9951     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9952       InBounds = gep->isInBounds();
9953     if (Reverse) {
9954       // If the address is consecutive but reversed, then the
9955       // wide store needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width vectors VScale is 1, so RunTimeVF = VF.getKnownMinValue()
9958       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9959       // NumElt = -Part * RunTimeVF
9960       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9961       // LastLane = 1 - RunTimeVF
9962       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
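      // Illustrative arithmetic (assuming a fixed VF of 4 and Part == 1):
      // RunTimeVF = 4, NumElt = -4 and LastLane = -3, so the two GEPs below
      // land at Ptr - 7 and the wide access covers the part-1 lanes in
      // reverse order.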
9963       PartPtr =
9964           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9965       PartPtr->setIsInBounds(InBounds);
9966       PartPtr = cast<GetElementPtrInst>(
9967           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9968       PartPtr->setIsInBounds(InBounds);
9969       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9970         BlockInMaskParts[Part] =
9971             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9972     } else {
9973       Value *Increment =
9974           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9975       PartPtr = cast<GetElementPtrInst>(
9976           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9977       PartPtr->setIsInBounds(InBounds);
9978     }
9979 
9980     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9981     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9982   };
9983 
9984   // Handle Stores:
9985   if (SI) {
9986     State.ILV->setDebugLocFromInst(SI);
9987 
9988     for (unsigned Part = 0; Part < State.UF; ++Part) {
9989       Instruction *NewSI = nullptr;
9990       Value *StoredVal = State.get(StoredValue, Part);
9991       if (CreateGatherScatter) {
9992         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9993         Value *VectorGep = State.get(getAddr(), Part);
9994         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9995                                             MaskPart);
9996       } else {
9997         if (Reverse) {
9998           // If we store to reverse consecutive memory locations, then we need
9999           // to reverse the order of elements in the stored value.
10000           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
10001           // We don't want to update the value in the map as it might be used in
10002           // another expression. So don't call resetVectorValue(StoredVal).
10003         }
10004         auto *VecPtr =
10005             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10006         if (isMaskRequired)
10007           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
10008                                             BlockInMaskParts[Part]);
10009         else
10010           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
10011       }
10012       State.ILV->addMetadata(NewSI, SI);
10013     }
10014     return;
10015   }
10016 
10017   // Handle loads.
10018   assert(LI && "Must have a load instruction");
10019   State.ILV->setDebugLocFromInst(LI);
10020   for (unsigned Part = 0; Part < State.UF; ++Part) {
10021     Value *NewLI;
10022     if (CreateGatherScatter) {
10023       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
10024       Value *VectorGep = State.get(getAddr(), Part);
10025       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
10026                                          nullptr, "wide.masked.gather");
10027       State.ILV->addMetadata(NewLI, LI);
10028     } else {
10029       auto *VecPtr =
10030           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10031       if (isMaskRequired)
10032         NewLI = Builder.CreateMaskedLoad(
10033             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
10034             PoisonValue::get(DataTy), "wide.masked.load");
10035       else
10036         NewLI =
10037             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
10038 
10039       // Add metadata to the load, but setVectorValue to the reverse shuffle.
10040       State.ILV->addMetadata(NewLI, LI);
10041       if (Reverse)
10042         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
10043     }
10044 
10045     State.set(this, NewLI, Part);
10046   }
10047 }
10048 
10049 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10050 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10051 // predication, and 4) a TTI hook that analyses whether the loop is suitable
10052 // for predication.
10053 static ScalarEpilogueLowering getScalarEpilogueLowering(
10054     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10055     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10056     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
10057     LoopVectorizationLegality &LVL) {
10058   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10059   // don't look at hints or options, and don't request a scalar epilogue.
10060   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10061   // LoopAccessInfo (due to code dependency and not being able to reliably get
10062   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10063   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10064   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10065   // back to the old way and vectorize with versioning when forced. See D81345.)
10066   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10067                                                       PGSOQueryType::IRPass) &&
10068                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10069     return CM_ScalarEpilogueNotAllowedOptSize;
10070 
10071   // 2) If set, obey the directives
10072   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10073     switch (PreferPredicateOverEpilogue) {
10074     case PreferPredicateTy::ScalarEpilogue:
10075       return CM_ScalarEpilogueAllowed;
10076     case PreferPredicateTy::PredicateElseScalarEpilogue:
10077       return CM_ScalarEpilogueNotNeededUsePredicate;
10078     case PreferPredicateTy::PredicateOrDontVectorize:
10079       return CM_ScalarEpilogueNotAllowedUsePredicate;
10080     };
10081   }
10082 
10083   // 3) If set, obey the hints
10084   switch (Hints.getPredicate()) {
10085   case LoopVectorizeHints::FK_Enabled:
10086     return CM_ScalarEpilogueNotNeededUsePredicate;
10087   case LoopVectorizeHints::FK_Disabled:
10088     return CM_ScalarEpilogueAllowed;
10089   };
10090 
  // 4) If the TTI hook indicates this is profitable, request predication.
10092   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10093                                        LVL.getLAI()))
10094     return CM_ScalarEpilogueNotNeededUsePredicate;
10095 
10096   return CM_ScalarEpilogueAllowed;
10097 }
10098 
10099 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If Values have been set for this Def, return the one relevant for \p Part.
10101   if (hasVectorValue(Def, Part))
10102     return Data.PerPartOutput[Def][Part];
10103 
10104   if (!hasScalarValue(Def, {Part, 0})) {
10105     Value *IRV = Def->getLiveInIRValue();
10106     Value *B = ILV->getBroadcastInstrs(IRV);
10107     set(Def, B, Part);
10108     return B;
10109   }
10110 
10111   Value *ScalarValue = get(Def, {Part, 0});
10112   // If we aren't vectorizing, we can just copy the scalar map values over
10113   // to the vector map.
10114   if (VF.isScalar()) {
10115     set(Def, ScalarValue, Part);
10116     return ScalarValue;
10117   }
10118 
10119   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
10120   bool IsUniform = RepR && RepR->isUniform();
10121 
10122   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
10123   // Check if there is a scalar value for the selected lane.
10124   if (!hasScalarValue(Def, {Part, LastLane})) {
10125     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
10126     assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
10127             isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
10128            "unexpected recipe found to be invariant");
10129     IsUniform = true;
10130     LastLane = 0;
10131   }
10132 
10133   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10134   // Set the insert point after the last scalarized instruction or after the
10135   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10136   // will directly follow the scalar definitions.
10137   auto OldIP = Builder.saveIP();
10138   auto NewIP =
10139       isa<PHINode>(LastInst)
10140           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10141           : std::next(BasicBlock::iterator(LastInst));
10142   Builder.SetInsertPoint(&*NewIP);
10143 
10144   // However, if we are vectorizing, we need to construct the vector values.
10145   // If the value is known to be uniform after vectorization, we can just
10146   // broadcast the scalar value corresponding to lane zero for each unroll
10147   // iteration. Otherwise, we construct the vector values using
10148   // insertelement instructions. Since the resulting vectors are stored in
10149   // State, we will only generate the insertelements once.
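  // Illustrative shape of the packing for a non-uniform Def with a fixed VF of
  // 4 (types and value names assumed for the example):
  //   %p0 = insertelement <4 x i32> poison, i32 %lane0, i64 0
  //   %p1 = insertelement <4 x i32> %p0,    i32 %lane1, i64 1
  //   %p2 = insertelement <4 x i32> %p1,    i32 %lane2, i64 2
  //   %p3 = insertelement <4 x i32> %p2,    i32 %lane3, i64 3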
10150   Value *VectorValue = nullptr;
10151   if (IsUniform) {
10152     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10153     set(Def, VectorValue, Part);
10154   } else {
    // Initialize packing with insertelements to start from poison.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Poison, Part);
10159     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10160       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10161     VectorValue = get(Def, Part);
10162   }
10163   Builder.restoreIP(OldIP);
10164   return VectorValue;
10165 }
10166 
10167 // Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which makes it possible to apply
// VPlan-to-VPlan transformations from the very beginning without modifying the
10170 // input LLVM IR.
10171 static bool processLoopInVPlanNativePath(
10172     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10173     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10174     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10175     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10176     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10177     LoopVectorizationRequirements &Requirements) {
10178 
10179   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10180     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10181     return false;
10182   }
10183   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10184   Function *F = L->getHeader()->getParent();
10185   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10186 
10187   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10188       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10189 
10190   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10191                                 &Hints, IAI);
10192   // Use the planner for outer loop vectorization.
10193   // TODO: CM is not used at this point inside the planner. Turn CM into an
10194   // optional argument if we don't need it in the future.
10195   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10196                                Requirements, ORE);
10197 
10198   // Get user vectorization factor.
10199   ElementCount UserVF = Hints.getWidth();
10200 
10201   CM.collectElementTypesForWidening();
10202 
10203   // Plan how to best vectorize, return the best VF and its cost.
10204   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10205 
10206   // If we are stress testing VPlan builds, do not attempt to generate vector
10207   // code. Masked vector code generation support will follow soon.
10208   // Also, do not attempt to vectorize if no vector code will be produced.
10209   if (VPlanBuildStressTest || EnableVPlanPredication ||
10210       VectorizationFactor::Disabled() == VF)
10211     return false;
10212 
10213   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10214 
10215   {
10216     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10217                              F->getParent()->getDataLayout());
10218     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10219                            &CM, BFI, PSI, Checks);
10220     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10221                       << L->getHeader()->getParent()->getName() << "\"\n");
10222     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10223   }
10224 
10225   // Mark the loop as already vectorized to avoid vectorizing again.
10226   Hints.setAlreadyVectorized();
10227   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10228   return true;
10229 }
10230 
10231 // Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with double-precision
// arithmetic, there will be a performance penalty from the conversion overhead
// and the change in the vector width.
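// Illustrative source pattern (assumed example, not taken from a test) that
// can trigger this remark:
//   float *A; double D;
//   for (...) A[i] = A[i] * D; // A[i] is extended to double, then truncated.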
10235 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10236   SmallVector<Instruction *, 4> Worklist;
10237   for (BasicBlock *BB : L->getBlocks()) {
10238     for (Instruction &Inst : *BB) {
10239       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10240         if (S->getValueOperand()->getType()->isFloatTy())
10241           Worklist.push_back(S);
10242       }
10243     }
10244   }
10245 
  // Traverse upwards from the floating point stores, searching for floating
  // point conversions.
10248   SmallPtrSet<const Instruction *, 4> Visited;
10249   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10250   while (!Worklist.empty()) {
10251     auto *I = Worklist.pop_back_val();
10252     if (!L->contains(I))
10253       continue;
10254     if (!Visited.insert(I).second)
10255       continue;
10256 
10257     // Emit a remark if the floating point store required a floating
10258     // point conversion.
10259     // TODO: More work could be done to identify the root cause such as a
10260     // constant or a function return type and point the user to it.
10261     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10262       ORE->emit([&]() {
10263         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10264                                           I->getDebugLoc(), L->getHeader())
10265                << "floating point conversion changes vector width. "
10266                << "Mixed floating point precision requires an up/down "
10267                << "cast that will negatively impact performance.";
10268       });
10269 
10270     for (Use &Op : I->operands())
10271       if (auto *OpI = dyn_cast<Instruction>(Op))
10272         Worklist.push_back(OpI);
10273   }
10274 }
10275 
10276 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10277     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10278                                !EnableLoopInterleaving),
10279       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10280                               !EnableLoopVectorization) {}
10281 
10282 bool LoopVectorizePass::processLoop(Loop *L) {
10283   assert((EnableVPlanNativePath || L->isInnermost()) &&
10284          "VPlan-native path is not enabled. Only process inner loops.");
10285 
10286 #ifndef NDEBUG
10287   const std::string DebugLocStr = getDebugLocString(L);
10288 #endif /* NDEBUG */
10289 
10290   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10291                     << L->getHeader()->getParent()->getName() << "' from "
10292                     << DebugLocStr << "\n");
10293 
10294   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10295 
10296   LLVM_DEBUG(
10297       dbgs() << "LV: Loop hints:"
10298              << " force="
10299              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10300                      ? "disabled"
10301                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10302                             ? "enabled"
10303                             : "?"))
10304              << " width=" << Hints.getWidth()
10305              << " interleave=" << Hints.getInterleave() << "\n");
10306 
10307   // Function containing loop
10308   Function *F = L->getHeader()->getParent();
10309 
10310   // Looking at the diagnostic output is the only way to determine if a loop
10311   // was vectorized (other than looking at the IR or machine code), so it
10312   // is important to generate an optimization remark for each loop. Most of
10313   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10314   // generated as OptimizationRemark and OptimizationRemarkMissed are
  // less verbose and report vectorized loops and unvectorized loops that may
10316   // benefit from vectorization, respectively.
10317 
10318   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10319     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10320     return false;
10321   }
10322 
10323   PredicatedScalarEvolution PSE(*SE, *L);
10324 
10325   // Check if it is legal to vectorize the loop.
10326   LoopVectorizationRequirements Requirements;
10327   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10328                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10329   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10330     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10331     Hints.emitRemarkWithHints();
10332     return false;
10333   }
10334 
10335   // Check the function attributes and profiles to find out if this function
10336   // should be optimized for size.
10337   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10338       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10339 
10340   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10341   // here. They may require CFG and instruction level transformations before
10342   // even evaluating whether vectorization is profitable. Since we cannot modify
10343   // the incoming IR, we need to build VPlan upfront in the vectorization
10344   // pipeline.
10345   if (!L->isInnermost())
10346     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10347                                         ORE, BFI, PSI, Hints, Requirements);
10348 
10349   assert(L->isInnermost() && "Inner loop expected.");
10350 
10351   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10352   // count by optimizing for size, to minimize overheads.
10353   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10354   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10355     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10356                       << "This loop is worth vectorizing only if no scalar "
10357                       << "iteration overheads are incurred.");
10358     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10359       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10360     else {
10361       LLVM_DEBUG(dbgs() << "\n");
10362       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10363     }
10364   }
10365 
10366   // Check the function attributes to see if implicit floats are allowed.
10367   // FIXME: This check doesn't seem possibly correct -- what if the loop is
10368   // an integer loop and the vector instructions selected are purely integer
10369   // vector instructions?
10370   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10371     reportVectorizationFailure(
10372         "Can't vectorize when the NoImplicitFloat attribute is used",
10373         "loop not vectorized due to NoImplicitFloat attribute",
10374         "NoImplicitFloat", ORE, L);
10375     Hints.emitRemarkWithHints();
10376     return false;
10377   }
10378 
10379   // Check if the target supports potentially unsafe FP vectorization.
10380   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10381   // for the target we're vectorizing for, to make sure none of the
10382   // additional fp-math flags can help.
10383   if (Hints.isPotentiallyUnsafe() &&
10384       TTI->isFPVectorizationPotentiallyUnsafe()) {
10385     reportVectorizationFailure(
10386         "Potentially unsafe FP op prevents vectorization",
10387         "loop not vectorized due to unsafe FP support.",
10388         "UnsafeFP", ORE, L);
10389     Hints.emitRemarkWithHints();
10390     return false;
10391   }
10392 
10393   bool AllowOrderedReductions;
10394   // If the flag is set, use that instead and override the TTI behaviour.
10395   if (ForceOrderedReductions.getNumOccurrences() > 0)
10396     AllowOrderedReductions = ForceOrderedReductions;
10397   else
10398     AllowOrderedReductions = TTI->enableOrderedReductions();
10399   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10400     ORE->emit([&]() {
10401       auto *ExactFPMathInst = Requirements.getExactFPInst();
10402       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10403                                                  ExactFPMathInst->getDebugLoc(),
10404                                                  ExactFPMathInst->getParent())
10405              << "loop not vectorized: cannot prove it is safe to reorder "
10406                 "floating-point operations";
10407     });
10408     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10409                          "reorder floating-point operations\n");
10410     Hints.emitRemarkWithHints();
10411     return false;
10412   }
10413 
10414   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10415   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10416 
10417   // If an override option has been passed in for interleaved accesses, use it.
10418   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10419     UseInterleaved = EnableInterleavedMemAccesses;
10420 
10421   // Analyze interleaved memory accesses.
10422   if (UseInterleaved) {
10423     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10424   }
10425 
10426   // Use the cost model.
10427   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10428                                 F, &Hints, IAI);
10429   CM.collectValuesToIgnore();
10430   CM.collectElementTypesForWidening();
10431 
10432   // Use the planner for vectorization.
10433   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10434                                Requirements, ORE);
10435 
10436   // Get user vectorization factor and interleave count.
10437   ElementCount UserVF = Hints.getWidth();
10438   unsigned UserIC = Hints.getInterleave();
10439 
10440   // Plan how to best vectorize, return the best VF and its cost.
10441   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10442 
10443   VectorizationFactor VF = VectorizationFactor::Disabled();
10444   unsigned IC = 1;
10445 
10446   if (MaybeVF) {
10447     VF = *MaybeVF;
10448     // Select the interleave count.
10449     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10450   }
10451 
10452   // Identify the diagnostic messages that should be produced.
10453   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10454   bool VectorizeLoop = true, InterleaveLoop = true;
10455   if (VF.Width.isScalar()) {
10456     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10457     VecDiagMsg = std::make_pair(
10458         "VectorizationNotBeneficial",
10459         "the cost-model indicates that vectorization is not beneficial");
10460     VectorizeLoop = false;
10461   }
10462 
10463   if (!MaybeVF && UserIC > 1) {
10464     // Tell the user interleaving was avoided up-front, despite being explicitly
10465     // requested.
10466     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10467                          "interleaving should be avoided up front\n");
10468     IntDiagMsg = std::make_pair(
10469         "InterleavingAvoided",
10470         "Ignoring UserIC, because interleaving was avoided up front");
10471     InterleaveLoop = false;
10472   } else if (IC == 1 && UserIC <= 1) {
10473     // Tell the user interleaving is not beneficial.
10474     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10475     IntDiagMsg = std::make_pair(
10476         "InterleavingNotBeneficial",
10477         "the cost-model indicates that interleaving is not beneficial");
10478     InterleaveLoop = false;
10479     if (UserIC == 1) {
10480       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10481       IntDiagMsg.second +=
10482           " and is explicitly disabled or interleave count is set to 1";
10483     }
10484   } else if (IC > 1 && UserIC == 1) {
    // Tell the user interleaving is beneficial, but it is explicitly disabled.
10486     LLVM_DEBUG(
10487         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10488     IntDiagMsg = std::make_pair(
10489         "InterleavingBeneficialButDisabled",
10490         "the cost-model indicates that interleaving is beneficial "
10491         "but is explicitly disabled or interleave count is set to 1");
10492     InterleaveLoop = false;
10493   }
10494 
10495   // Override IC if user provided an interleave count.
10496   IC = UserIC > 0 ? UserIC : IC;
10497 
10498   // Emit diagnostic messages, if any.
10499   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10500   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
10502     ORE->emit([&]() {
10503       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10504                                       L->getStartLoc(), L->getHeader())
10505              << VecDiagMsg.second;
10506     });
10507     ORE->emit([&]() {
10508       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10509                                       L->getStartLoc(), L->getHeader())
10510              << IntDiagMsg.second;
10511     });
10512     return false;
10513   } else if (!VectorizeLoop && InterleaveLoop) {
10514     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10515     ORE->emit([&]() {
10516       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10517                                         L->getStartLoc(), L->getHeader())
10518              << VecDiagMsg.second;
10519     });
10520   } else if (VectorizeLoop && !InterleaveLoop) {
10521     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10522                       << ") in " << DebugLocStr << '\n');
10523     ORE->emit([&]() {
10524       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10525                                         L->getStartLoc(), L->getHeader())
10526              << IntDiagMsg.second;
10527     });
10528   } else if (VectorizeLoop && InterleaveLoop) {
10529     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10530                       << ") in " << DebugLocStr << '\n');
10531     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10532   }
10533 
10534   bool DisableRuntimeUnroll = false;
10535   MDNode *OrigLoopID = L->getLoopID();
10536   {
    // Optimistically generate runtime checks. Drop them if they turn out not to
    // be profitable. Limit the scope of Checks, so the cleanup happens
    // immediately after vector code generation is done.
10540     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10541                              F->getParent()->getDataLayout());
10542     if (!VF.Width.isScalar() || IC > 1)
10543       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate());
10544 
10545     using namespace ore;
10546     if (!VectorizeLoop) {
10547       assert(IC > 1 && "interleave count should not be 1 or 0");
10548       // If we decided that it is not legal to vectorize the loop, then
10549       // interleave it.
10550       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10551                                  &CM, BFI, PSI, Checks);
10552 
10553       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10554       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10555 
10556       ORE->emit([&]() {
10557         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10558                                   L->getHeader())
10559                << "interleaved loop (interleaved count: "
10560                << NV("InterleaveCount", IC) << ")";
10561       });
10562     } else {
10563       // If we decided that it is *legal* to vectorize the loop, then do it.
10564 
10565       // Consider vectorizing the epilogue too if it's profitable.
10566       VectorizationFactor EpilogueVF =
10567           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10568       if (EpilogueVF.Width.isVector()) {
10569 
10570         // The first pass vectorizes the main loop and creates a scalar epilogue
10571         // to be vectorized by executing the plan (potentially with a different
10572         // factor) again shortly afterwards.
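        // Rough shape of the result (illustrative): a main vector loop using
        // VF.Width x IC, followed by a vectorized epilogue loop using
        // EpilogueVF.Width x 1, followed by the remaining scalar iterations.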
10573         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10574         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10575                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10576 
10577         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10578         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10579                         DT);
10580         ++LoopsVectorized;
10581 
10582         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10583         formLCSSARecursively(*L, *DT, LI, SE);
10584 
10585         // Second pass vectorizes the epilogue and adjusts the control flow
10586         // edges from the first pass.
10587         EPI.MainLoopVF = EPI.EpilogueVF;
10588         EPI.MainLoopUF = EPI.EpilogueUF;
10589         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10590                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10591                                                  Checks);
10592 
10593         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10594 
10595         // Ensure that the start values for any VPReductionPHIRecipes are
10596         // updated before vectorising the epilogue loop.
10597         VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock();
10598         for (VPRecipeBase &R : Header->phis()) {
10599           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10600             if (auto *Resume = MainILV.getReductionResumeValue(
10601                     ReductionPhi->getRecurrenceDescriptor())) {
10602               VPValue *StartVal = new VPValue(Resume);
10603               BestEpiPlan.addExternalDef(StartVal);
10604               ReductionPhi->setOperand(0, StartVal);
10605             }
10606           }
10607         }
10608 
10609         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10610                         DT);
10611         ++LoopsEpilogueVectorized;
10612 
10613         if (!MainILV.areSafetyChecksAdded())
10614           DisableRuntimeUnroll = true;
10615       } else {
10616         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10617                                &LVL, &CM, BFI, PSI, Checks);
10618 
10619         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10620         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
10621         ++LoopsVectorized;
10622 
10623         // Add metadata to disable runtime unrolling a scalar loop when there
10624         // are no runtime checks about strides and memory. A scalar loop that is
10625         // rarely used is not worth unrolling.
10626         if (!LB.areSafetyChecksAdded())
10627           DisableRuntimeUnroll = true;
10628       }
10629       // Report the vectorization decision.
10630       ORE->emit([&]() {
10631         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10632                                   L->getHeader())
10633                << "vectorized loop (vectorization width: "
10634                << NV("VectorizationFactor", VF.Width)
10635                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10636       });
10637     }
10638 
10639     if (ORE->allowExtraAnalysis(LV_NAME))
10640       checkMixedPrecision(L, ORE);
10641   }
10642 
10643   Optional<MDNode *> RemainderLoopID =
10644       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10645                                       LLVMLoopVectorizeFollowupEpilogue});
10646   if (RemainderLoopID.hasValue()) {
10647     L->setLoopID(RemainderLoopID.getValue());
10648   } else {
10649     if (DisableRuntimeUnroll)
10650       AddRuntimeUnrollDisableMetaData(L);
10651 
10652     // Mark the loop as already vectorized to avoid vectorizing again.
10653     Hints.setAlreadyVectorized();
10654   }
10655 
10656   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10657   return true;
10658 }
10659 
10660 LoopVectorizeResult LoopVectorizePass::runImpl(
10661     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10662     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10663     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10664     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10665     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10666   SE = &SE_;
10667   LI = &LI_;
10668   TTI = &TTI_;
10669   DT = &DT_;
10670   BFI = &BFI_;
10671   TLI = TLI_;
10672   AA = &AA_;
10673   AC = &AC_;
10674   GetLAA = &GetLAA_;
10675   DB = &DB_;
10676   ORE = &ORE_;
10677   PSI = PSI_;
10678 
10679   // Don't attempt if
10680   // 1. the target claims to have no vector registers, and
10681   // 2. interleaving won't help ILP.
10682   //
10683   // The second condition is necessary because, even if the target has no
10684   // vector registers, loop vectorization may still enable scalar
10685   // interleaving.
10686   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10687       TTI->getMaxInterleaveFactor(1) < 2)
10688     return LoopVectorizeResult(false, false);
10689 
10690   bool Changed = false, CFGChanged = false;
10691 
10692   // The vectorizer requires loops to be in simplified form.
10693   // Since simplification may add new inner loops, it has to run before the
10694   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
10696   // vectorized.
10697   for (auto &L : *LI)
10698     Changed |= CFGChanged |=
10699         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10700 
10701   // Build up a worklist of inner-loops to vectorize. This is necessary as
10702   // the act of vectorizing or partially unrolling a loop creates new loops
10703   // and can invalidate iterators across the loops.
10704   SmallVector<Loop *, 8> Worklist;
10705 
10706   for (Loop *L : *LI)
10707     collectSupportedLoops(*L, LI, ORE, Worklist);
10708 
10709   LoopsAnalyzed += Worklist.size();
10710 
10711   // Now walk the identified inner loops.
10712   while (!Worklist.empty()) {
10713     Loop *L = Worklist.pop_back_val();
10714 
10715     // For the inner loops we actually process, form LCSSA to simplify the
10716     // transform.
10717     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10718 
10719     Changed |= CFGChanged |= processLoop(L);
10720   }
10721 
10722   // Process each loop nest in the function.
10723   return LoopVectorizeResult(Changed, CFGChanged);
10724 }
10725 
10726 PreservedAnalyses LoopVectorizePass::run(Function &F,
10727                                          FunctionAnalysisManager &AM) {
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &LI = AM.getResult<LoopAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
10775 }
10776 
10777 void LoopVectorizePass::printPipeline(
10778     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10779   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10780       OS, MapClassName2PassName);
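  // With default options this prints roughly (illustrative, assuming the
  // usual "loop-vectorize" textual pass name mapping):
  //   loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>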
10781 
10782   OS << "<";
10783   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10784   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10785   OS << ">";
10786 }
10787