1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
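//
// For illustration only (a sketch, not the exact code the pass emits): with a
// vectorization factor of 4, a scalar loop such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
//
// is conceptually rewritten so that each wide iteration processes four
// elements and the induction variable advances by 4, with leftover iterations
// handled by a scalar remainder loop:
//
//   for (i = 0; i + 4 <= n; i += 4)
//     a[i..i+3] = b[i..i+3] + 1;   // one SIMD add of four elements
//   for (; i < n; ++i)             // scalar remainder (epilogue)
//     a[i] = b[i] + 1;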
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is an ongoing effort to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SmallPtrSet.h"
73 #include "llvm/ADT/SmallSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
91 #include "llvm/Analysis/ProfileSummaryInfo.h"
92 #include "llvm/Analysis/ScalarEvolution.h"
93 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
94 #include "llvm/Analysis/TargetLibraryInfo.h"
95 #include "llvm/Analysis/TargetTransformInfo.h"
96 #include "llvm/Analysis/VectorUtils.h"
97 #include "llvm/IR/Attributes.h"
98 #include "llvm/IR/BasicBlock.h"
99 #include "llvm/IR/CFG.h"
100 #include "llvm/IR/Constant.h"
101 #include "llvm/IR/Constants.h"
102 #include "llvm/IR/DataLayout.h"
103 #include "llvm/IR/DebugInfoMetadata.h"
104 #include "llvm/IR/DebugLoc.h"
105 #include "llvm/IR/DerivedTypes.h"
106 #include "llvm/IR/DiagnosticInfo.h"
107 #include "llvm/IR/Dominators.h"
108 #include "llvm/IR/Function.h"
109 #include "llvm/IR/IRBuilder.h"
110 #include "llvm/IR/InstrTypes.h"
111 #include "llvm/IR/Instruction.h"
112 #include "llvm/IR/Instructions.h"
113 #include "llvm/IR/IntrinsicInst.h"
114 #include "llvm/IR/Intrinsics.h"
115 #include "llvm/IR/LLVMContext.h"
116 #include "llvm/IR/Metadata.h"
117 #include "llvm/IR/Module.h"
118 #include "llvm/IR/Operator.h"
119 #include "llvm/IR/PatternMatch.h"
120 #include "llvm/IR/Type.h"
121 #include "llvm/IR/Use.h"
122 #include "llvm/IR/User.h"
123 #include "llvm/IR/Value.h"
124 #include "llvm/IR/ValueHandle.h"
125 #include "llvm/IR/Verifier.h"
126 #include "llvm/InitializePasses.h"
127 #include "llvm/Pass.h"
128 #include "llvm/Support/Casting.h"
129 #include "llvm/Support/CommandLine.h"
130 #include "llvm/Support/Compiler.h"
131 #include "llvm/Support/Debug.h"
132 #include "llvm/Support/ErrorHandling.h"
133 #include "llvm/Support/InstructionCost.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/InjectTLIMappings.h"
138 #include "llvm/Transforms/Utils/LoopSimplify.h"
139 #include "llvm/Transforms/Utils/LoopUtils.h"
140 #include "llvm/Transforms/Utils/LoopVersioning.h"
141 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
142 #include "llvm/Transforms/Utils/SizeOpts.h"
143 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
144 #include <algorithm>
145 #include <cassert>
146 #include <cstdint>
147 #include <cstdlib>
148 #include <functional>
149 #include <iterator>
150 #include <limits>
151 #include <memory>
152 #include <string>
153 #include <tuple>
154 #include <utility>
155 
156 using namespace llvm;
157 
158 #define LV_NAME "loop-vectorize"
159 #define DEBUG_TYPE LV_NAME
160 
161 #ifndef NDEBUG
162 const char VerboseDebug[] = DEBUG_TYPE "-verbose";
163 #endif
164 
165 /// @{
166 /// Metadata attribute names
167 const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
168 const char LLVMLoopVectorizeFollowupVectorized[] =
169     "llvm.loop.vectorize.followup_vectorized";
170 const char LLVMLoopVectorizeFollowupEpilogue[] =
171     "llvm.loop.vectorize.followup_epilogue";
172 /// @}
173 
174 STATISTIC(LoopsVectorized, "Number of loops vectorized");
175 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177 
178 static cl::opt<bool> EnableEpilogueVectorization(
179     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
180     cl::desc("Enable vectorization of epilogue loops."));
181 
182 static cl::opt<unsigned> EpilogueVectorizationForceVF(
183     "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
184     cl::desc("When epilogue vectorization is enabled, and a value greater than "
185              "1 is specified, forces the given VF for all applicable epilogue "
186              "loops."));
187 
188 static cl::opt<unsigned> EpilogueVectorizationMinVF(
189     "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
190     cl::desc("Only loops with vectorization factor equal to or larger than "
191              "the specified value are considered for epilogue vectorization."));
192 
193 /// Loops with a known constant trip count below this number are vectorized only
194 /// if no scalar iteration overheads are incurred.
195 static cl::opt<unsigned> TinyTripCountVectorThreshold(
196     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
197     cl::desc("Loops with a constant trip count that is smaller than this "
198              "value are vectorized only if no scalar iteration overheads "
199              "are incurred."));
200 
201 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
202     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
203     cl::desc("The maximum allowed number of runtime memory checks with a "
204              "vectorize(enable) pragma."));
205 
206 // The option prefer-predicate-over-epilogue indicates that an epilogue is
207 // undesired and that predication is preferred; it lists all available options.
208 // I.e., the vectorizer will try to fold the tail loop (epilogue) into the
209 // vector body and predicate the instructions accordingly. If tail-folding
210 // fails, the fallback strategy is selected by the values below.
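//
// For example (a usage sketch), passing
//   -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue
// asks the vectorizer to try tail-folding first and to fall back to a scalar
// epilogue only if tail-folding fails.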
211 namespace PreferPredicateTy {
212   enum Option {
213     ScalarEpilogue = 0,
214     PredicateElseScalarEpilogue,
215     PredicateOrDontVectorize
216   };
217 } // namespace PreferPredicateTy
218 
219 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
220     "prefer-predicate-over-epilogue",
221     cl::init(PreferPredicateTy::ScalarEpilogue),
222     cl::Hidden,
223     cl::desc("Tail-folding and predication preferences over creating a scalar "
224              "epilogue loop."),
225     cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
226                          "scalar-epilogue",
227                          "Don't tail-predicate loops, create scalar epilogue"),
228               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
229                          "predicate-else-scalar-epilogue",
230                          "prefer tail-folding, create scalar epilogue if tail "
231                          "folding fails."),
232               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
233                          "predicate-dont-vectorize",
234                          "prefer tail-folding, don't attempt vectorization if "
235                          "tail-folding fails.")));
236 
237 static cl::opt<bool> MaximizeBandwidth(
238     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
239     cl::desc("Maximize bandwidth when selecting vectorization factor which "
240              "will be determined by the smallest type in the loop."));
241 
242 static cl::opt<bool> EnableInterleavedMemAccesses(
243     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
244     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
245 
246 /// An interleave-group may need masking if it resides in a block that needs
247 /// predication, or in order to mask away gaps.
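/// For example (a sketch): a group of accesses with stride 3 whose members
/// touch only A[3*i] and A[3*i+1] leaves a gap at A[3*i+2]; masking that gap
/// lets the wide memory operation skip elements the scalar loop never touches.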
248 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
249     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
250     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
251 
252 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
253     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
254     cl::desc("We don't interleave loops with an estimated constant trip count "
255              "below this number"));
256 
257 static cl::opt<unsigned> ForceTargetNumScalarRegs(
258     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
259     cl::desc("A flag that overrides the target's number of scalar registers."));
260 
261 static cl::opt<unsigned> ForceTargetNumVectorRegs(
262     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
263     cl::desc("A flag that overrides the target's number of vector registers."));
264 
265 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
266     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
267     cl::desc("A flag that overrides the target's max interleave factor for "
268              "scalar loops."));
269 
270 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
271     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
272     cl::desc("A flag that overrides the target's max interleave factor for "
273              "vectorized loops."));
274 
275 static cl::opt<unsigned> ForceTargetInstructionCost(
276     "force-target-instruction-cost", cl::init(0), cl::Hidden,
277     cl::desc("A flag that overrides the target's expected cost for "
278              "an instruction to a single constant value. Mostly "
279              "useful for getting consistent testing."));
280 
281 static cl::opt<bool> ForceTargetSupportsScalableVectors(
282     "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
283     cl::desc(
284         "Pretend that scalable vectors are supported, even if the target does "
285         "not support them. This flag should only be used for testing."));
286 
287 static cl::opt<unsigned> SmallLoopCost(
288     "small-loop-cost", cl::init(20), cl::Hidden,
289     cl::desc(
290         "The cost of a loop that is considered 'small' by the interleaver."));
291 
292 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
293     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
294     cl::desc("Enable the use of the block frequency analysis to access PGO "
295              "heuristics minimizing code growth in cold regions and being more "
296              "aggressive in hot regions."));
297 
298 // Runtime interleave loops for load/store throughput.
299 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
300     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
301     cl::desc(
302         "Enable runtime interleaving until load/store ports are saturated"));
303 
304 /// Interleave small loops with scalar reductions.
305 static cl::opt<bool> InterleaveSmallLoopScalarReduction(
306     "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
307     cl::desc("Enable interleaving for loops with small iteration counts that "
308              "contain scalar reductions to expose ILP."));
309 
310 static cl::opt<bool> EnableIndVarRegisterHeur(
311     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
312     cl::desc("Count the induction variable only once when interleaving"));
313 
314 static cl::opt<bool> EnableCondStoresVectorization(
315     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
316     cl::desc("Enable if-predication of stores during vectorization."));
317 
318 static cl::opt<unsigned> MaxNestedScalarReductionIC(
319     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
320     cl::desc("The maximum interleave count to use when interleaving a scalar "
321              "reduction in a nested loop."));
322 
323 static cl::opt<bool>
324     PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
325                            cl::Hidden,
326                            cl::desc("Prefer in-loop vector reductions, "
327                                     "overriding the target's preference."));
328 
329 static cl::opt<bool> ForceOrderedReductions(
330     "force-ordered-reductions", cl::init(false), cl::Hidden,
331     cl::desc("Enable the vectorization of loops with in-order (strict) "
332              "FP reductions"));
333 
334 static cl::opt<bool> PreferPredicatedReductionSelect(
335     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
336     cl::desc(
337         "Prefer predicating a reduction operation over an after-loop select."));
338 
339 cl::opt<bool> EnableVPlanNativePath(
340     "enable-vplan-native-path", cl::init(false), cl::Hidden,
341     cl::desc("Enable VPlan-native vectorization path with "
342              "support for outer loop vectorization."));
343 
344 // FIXME: Remove this switch once we have divergence analysis. Currently we
345 // assume divergent non-backedge branches when this switch is true.
346 cl::opt<bool> EnableVPlanPredication(
347     "enable-vplan-predication", cl::init(false), cl::Hidden,
348     cl::desc("Enable VPlan-native vectorization path predicator with "
349              "support for outer loop vectorization."));
350 
351 // This flag enables the stress testing of the VPlan H-CFG construction in the
352 // VPlan-native vectorization path. It must be used in conjunction with
353 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
354 // verification of the H-CFGs built.
355 static cl::opt<bool> VPlanBuildStressTest(
356     "vplan-build-stress-test", cl::init(false), cl::Hidden,
357     cl::desc(
358         "Build VPlan for every supported loop nest in the function and bail "
359         "out right after the build (stress test the VPlan H-CFG construction "
360         "in the VPlan-native vectorization path)."));
361 
362 cl::opt<bool> llvm::EnableLoopInterleaving(
363     "interleave-loops", cl::init(true), cl::Hidden,
364     cl::desc("Enable loop interleaving in Loop vectorization passes"));
365 cl::opt<bool> llvm::EnableLoopVectorization(
366     "vectorize-loops", cl::init(true), cl::Hidden,
367     cl::desc("Run the Loop vectorization passes"));
368 
369 cl::opt<bool> PrintVPlansInDotFormat(
370     "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
371     cl::desc("Use dot format instead of plain text when dumping VPlans"));
372 
373 /// A helper function that returns true if the given type is irregular. The
374 /// type is irregular if its allocated size doesn't equal the store size of an
375 /// element of the corresponding vector type.
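/// For illustration, assuming a typical x86-64 data layout: x86_fp80 has a
/// type size of 80 bits but an alloc size of 128 bits, so it is irregular,
/// whereas i32 (32 bits for both) is not.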
376 static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
377   // Determine if an array of N elements of type Ty is "bitcast compatible"
378   // with a <N x Ty> vector.
379   // This is only true if there is no padding between the array elements.
380   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
381 }
382 
383 /// A helper function that returns the reciprocal of the block probability of
384 /// predicated blocks. If we return X, we are assuming the predicated block
385 /// will execute once for every X iterations of the loop header.
386 ///
387 /// TODO: We should use actual block probability here, if available. Currently,
388 ///       we always assume predicated blocks have a 50% chance of executing.
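///
/// For example, the current return value of 2 encodes that 50% assumption: a
/// predicated block is assumed to execute once for every two iterations of
/// the loop header.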
389 static unsigned getReciprocalPredBlockProb() { return 2; }
390 
391 /// A helper function that returns an integer or floating-point constant with
392 /// value C.
393 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
394   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
395                            : ConstantFP::get(Ty, C);
396 }
397 
398 /// Returns "best known" trip count for the specified loop \p L as defined by
399 /// the following procedure:
400 ///   1) Returns exact trip count if it is known.
401 ///   2) Returns expected trip count according to profile data if any.
402 ///   3) Returns upper bound estimate if it is known.
403 ///   4) Returns None if all of the above failed.
404 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
405   // Check if exact trip count is known.
406   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
407     return ExpectedTC;
408 
409   // Check if there is an expected trip count available from profile data.
410   if (LoopVectorizeWithBlockFrequency)
411     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
412       return EstimatedTC;
413 
414   // Check if upper bound estimate is known.
415   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
416     return ExpectedTC;
417 
418   return None;
419 }
420 
421 // Forward declare GeneratedRTChecks.
422 class GeneratedRTChecks;
423 
424 namespace llvm {
425 
426 AnalysisKey ShouldRunExtraVectorPasses::Key;
427 
428 /// InnerLoopVectorizer vectorizes loops which contain only one basic
429 /// block to a specified vectorization factor (VF).
430 /// This class performs the widening of scalars into vectors, or multiple
431 /// scalars. This class also implements the following features:
432 /// * It inserts an epilogue loop for handling loops that don't have iteration
433 ///   counts that are known to be a multiple of the vectorization factor.
434 /// * It handles the code generation for reduction variables.
435 /// * Scalarization (implementation using scalars) of un-vectorizable
436 ///   instructions.
437 /// InnerLoopVectorizer does not perform any vectorization-legality
438 /// checks, and relies on the caller to check for the different legality
439 /// aspects. The InnerLoopVectorizer relies on the
440 /// LoopVectorizationLegality class to provide information about the induction
441   /// and reduction variables that were found for a given vectorization factor.
442 class InnerLoopVectorizer {
443 public:
444   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
445                       LoopInfo *LI, DominatorTree *DT,
446                       const TargetLibraryInfo *TLI,
447                       const TargetTransformInfo *TTI, AssumptionCache *AC,
448                       OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
449                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
450                       LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
451                       ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
452       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
453         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
454         Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
455         PSI(PSI), RTChecks(RTChecks) {
456     // Query this against the original loop and save it here because the profile
457     // of the original loop header may change as the transformation happens.
458     OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
459         OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
460   }
461 
462   virtual ~InnerLoopVectorizer() = default;
463 
464   /// Create a new empty loop that will contain vectorized instructions later
465   /// on, while the old loop will be used as the scalar remainder. Control flow
466   /// is generated around the vectorized (and scalar epilogue) loops consisting
467   /// of various checks and bypasses. Return the pre-header block of the new
468   /// loop and the start value for the canonical induction, if it is != 0. The
469   /// latter is the case when vectorizing the epilogue loop. In the case of
470   /// epilogue vectorization, this function is overridden to handle the more
471   /// complex control flow around the loops.
472   virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();
473 
474   /// Widen a single call instruction within the innermost loop.
475   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
476                             VPTransformState &State);
477 
478   /// Fix the vectorized code, taking care of header phis, live-outs, and more.
479   void fixVectorizedLoop(VPTransformState &State);
480 
481   // Return true if any runtime check is added.
482   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
483 
484   /// A type for vectorized values in the new loop. Each value from the
485   /// original loop, when vectorized, is represented by UF vector values in the
486   /// new unrolled loop, where UF is the unroll factor.
487   using VectorParts = SmallVector<Value *, 2>;
488 
489   /// Vectorize a single first-order recurrence or pointer induction PHINode in
490   /// a block. This method handles the induction variable canonicalization. It
491   /// supports both VF = 1 for unrolled loops and arbitrary length vectors.
492   void widenPHIInstruction(Instruction *PN, VPWidenPHIRecipe *PhiR,
493                            VPTransformState &State);
494 
495   /// A helper function to scalarize a single Instruction in the innermost loop.
496   /// Generates a sequence of scalar instances for each lane between \p MinLane
497   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
498   /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
499   /// Instr's operands.
500   void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
501                             const VPIteration &Instance, bool IfPredicateInstr,
502                             VPTransformState &State);
503 
504   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
505   /// is provided, the integer induction variable will first be truncated to
506   /// the corresponding type. \p CanonicalIV is the scalar value generated for
507   /// the canonical induction variable.
508   void widenIntOrFpInduction(PHINode *IV, VPWidenIntOrFpInductionRecipe *Def,
509                              VPTransformState &State, Value *CanonicalIV);
510 
511   /// Construct the vector value of a scalarized value \p V one lane at a time.
512   void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
513                                  VPTransformState &State);
514 
515   /// Try to vectorize interleaved access group \p Group with the base address
516   /// given in \p Addr, optionally masking the vector operations if \p
517   /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
518   /// values in the vectorized loop.
519   void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
520                                 ArrayRef<VPValue *> VPDefs,
521                                 VPTransformState &State, VPValue *Addr,
522                                 ArrayRef<VPValue *> StoredValues,
523                                 VPValue *BlockInMask = nullptr);
524 
525   /// Set the debug location in the builder \p CustomBuilder using the debug
526   /// location in \p V; if \p CustomBuilder is None, the member Builder is used.
527   void setDebugLocFromInst(const Value *V,
528                            Optional<IRBuilderBase *> CustomBuilder = None);
529 
530   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
531   void fixNonInductionPHIs(VPTransformState &State);
532 
533   /// Returns true if the reordering of FP operations is not allowed, but we are
534   /// able to vectorize with strict in-order reductions for the given RdxDesc.
535   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);
536 
537   /// Create a broadcast instruction. This method generates a broadcast
538   /// instruction (shuffle) for loop invariant values and for the induction
539   /// value. If this is the induction variable then we extend it to N, N+1, ...
540   /// This is needed because each iteration in the loop corresponds to a SIMD
541   /// element.
542   virtual Value *getBroadcastInstrs(Value *V);
543 
544   /// Add metadata from one instruction to another.
545   ///
546   /// This includes both the original MDs from \p From and additional ones (\see
547   /// addNewMetadata).  Use this for *newly created* instructions in the vector
548   /// loop.
549   void addMetadata(Instruction *To, Instruction *From);
550 
551   /// Similar to the previous function but it adds the metadata to a
552   /// vector of instructions.
553   void addMetadata(ArrayRef<Value *> To, Instruction *From);
554 
555   /// Returns the resume value (bc.merge.rdx) for a reduction as
556   /// generated by fixReduction.
557   PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);
558 
559 protected:
560   friend class LoopVectorizationPlanner;
561 
562   /// A small list of PHINodes.
563   using PhiVector = SmallVector<PHINode *, 4>;
564 
565   /// A type for scalarized values in the new loop. Each value from the
566   /// original loop, when scalarized, is represented by UF x VF scalar values
567   /// in the new unrolled loop, where UF is the unroll factor and VF is the
568   /// vectorization factor.
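  /// For example, with UF = 2 and VF = 4 each original scalar value has
  /// 2 x 4 = 8 scalar copies, indexed by unroll part and vector lane.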
569   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
570 
571   /// Set up the values of the IVs correctly when exiting the vector loop.
572   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
573                     Value *CountRoundDown, Value *EndValue,
574                     BasicBlock *MiddleBlock);
575 
576   /// Introduce a conditional branch (on true, condition to be set later) at the
577   /// end of the header (which is also the latch), connecting it to itself
578   /// (across the backedge) and to the exit block of \p L.
579   void createHeaderBranch(Loop *L);
580 
581   /// Handle all cross-iteration phis in the header.
582   void fixCrossIterationPHIs(VPTransformState &State);
583 
584   /// Create the exit value of first order recurrences in the middle block and
585   /// update their users.
586   void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
587                                VPTransformState &State);
588 
589   /// Create code for the loop exit value of the reduction.
590   void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);
591 
592   /// Clear NSW/NUW flags from reduction instructions if necessary.
593   void clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
594                                VPTransformState &State);
595 
596   /// Fixup the LCSSA phi nodes in the unique exit block.  This simply
597   /// means we need to add the appropriate incoming value from the middle
598   /// block as exiting edges from the scalar epilogue loop (if present) are
599   /// already in place, and we exit the vector loop exclusively to the middle
600   /// block.
601   void fixLCSSAPHIs(VPTransformState &State);
602 
603   /// Iteratively sink the scalarized operands of a predicated instruction into
604   /// the block that was created for it.
605   void sinkScalarOperands(Instruction *PredInst);
606 
607   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
608   /// represented as.
609   void truncateToMinimalBitwidths(VPTransformState &State);
610 
611   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
612   /// variable on which to base the steps, \p Step is the size of the step, and
613   /// \p EntryVal is the value from the original loop that maps to the steps.
614   /// Note that \p EntryVal doesn't have to be an induction variable - it
615   /// can also be a truncate instruction.
616   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
617                         const InductionDescriptor &ID, VPValue *Def,
618                         VPTransformState &State);
619 
620   /// Create a vector induction phi node based on an existing scalar one. \p
621   /// EntryVal is the value from the original loop that maps to the vector phi
622   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
623   /// truncate instruction, instead of widening the original IV, we widen a
624   /// version of the IV truncated to \p EntryVal's type.
625   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
626                                        Value *Step, Value *Start,
627                                        Instruction *EntryVal, VPValue *Def,
628                                        VPTransformState &State);
629 
630   /// Returns (and creates if needed) the original loop trip count.
631   Value *getOrCreateTripCount(Loop *NewLoop);
632 
633   /// Returns (and creates if needed) the trip count of the widened loop.
634   Value *getOrCreateVectorTripCount(Loop *NewLoop);
635 
636   /// Returns a bitcasted value to the requested vector type.
637   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
638   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
639                                 const DataLayout &DL);
640 
641   /// Emit a bypass check to see if the vector trip count is zero, including if
642   /// it overflows.
643   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
644 
645   /// Emit a bypass check to see if all of the SCEV assumptions we've
646   /// had to make are correct. Returns the block containing the checks or
647   /// nullptr if no checks have been added.
648   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass);
649 
650   /// Emit bypass checks to check any memory assumptions we may have made.
651   /// Returns the block containing the checks or nullptr if no checks have been
652   /// added.
653   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
654 
655   /// Compute the transformed value of Index at offset StartValue using step
656   /// StepValue.
657   /// For integer induction, returns StartValue + Index * StepValue.
658   /// For pointer induction, returns StartValue[Index * StepValue].
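  /// For example (illustrative only): an integer induction with StartValue 7
  /// and StepValue 3 maps Index 0, 1, 2 to 7, 10, 13, while a pointer
  /// induction with the same step maps Index i to &StartValue[3 * i].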
659   /// FIXME: The newly created binary instructions should contain nsw/nuw
660   /// flags, which can be found from the original scalar operations.
661   Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
662                               ScalarEvolution *SE, const DataLayout &DL,
663                               const InductionDescriptor &ID,
664                               BasicBlock *VectorHeader) const;
665 
666   /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
667   /// vector loop preheader, middle block and scalar preheader. Also
668   /// allocate a loop object for the new vector loop and return it.
669   Loop *createVectorLoopSkeleton(StringRef Prefix);
670 
671   /// Create new phi nodes for the induction variables to resume iteration count
672   /// in the scalar epilogue, from where the vectorized loop left off.
673   /// In cases where the loop skeleton is more complicated (e.g. epilogue
674   /// vectorization) and the resume values can come from an additional bypass
675   /// block, the \p AdditionalBypass pair provides information about the bypass
676   /// block and the end value on the edge from bypass to this loop.
677   void createInductionResumeValues(
678       Loop *L,
679       std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});
680 
681   /// Complete the loop skeleton by adding debug MDs, creating appropriate
682   /// conditional branches in the middle block, preparing the builder and
683   /// running the verifier. Take in the vector loop \p L as argument, and return
684   /// the preheader of the completed vector loop.
685   BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
686 
687   /// Add additional metadata to \p To that was not present on \p Orig.
688   ///
689   /// Currently this is used to add the noalias annotations based on the
690   /// inserted memchecks.  Use this for instructions that are *cloned* into the
691   /// vector loop.
692   void addNewMetadata(Instruction *To, const Instruction *Orig);
693 
694   /// Collect poison-generating recipes that may generate a poison value that is
695   /// used after vectorization, even when their operands are not poison. Those
696   /// recipes meet the following conditions:
697   ///  * Contribute to the address computation of a recipe generating a widen
698   ///    memory load/store (VPWidenMemoryInstructionRecipe or
699   ///    VPInterleaveRecipe).
700   ///  * Such a widen memory load/store has at least one underlying Instruction
701   ///    that is in a basic block that needs predication and after vectorization
702   ///    the generated instruction won't be predicated.
703   void collectPoisonGeneratingRecipes(VPTransformState &State);
704 
705   /// Allow subclasses to override and print debug traces before/after vplan
706   /// execution, when trace information is requested.
707   virtual void printDebugTracesAtStart() {}
708   virtual void printDebugTracesAtEnd() {}
709 
710   /// The original loop.
711   Loop *OrigLoop;
712 
713   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
714   /// dynamic knowledge to simplify SCEV expressions and converts them to a
715   /// more usable form.
716   PredicatedScalarEvolution &PSE;
717 
718   /// Loop Info.
719   LoopInfo *LI;
720 
721   /// Dominator Tree.
722   DominatorTree *DT;
723 
724   /// Alias Analysis.
725   AAResults *AA;
726 
727   /// Target Library Info.
728   const TargetLibraryInfo *TLI;
729 
730   /// Target Transform Info.
731   const TargetTransformInfo *TTI;
732 
733   /// Assumption Cache.
734   AssumptionCache *AC;
735 
736   /// Interface to emit optimization remarks.
737   OptimizationRemarkEmitter *ORE;
738 
739   /// LoopVersioning.  It's only set up (non-null) if memchecks were
740   /// used.
741   ///
742   /// This is currently only used to add no-alias metadata based on the
743   /// memchecks.  The actual versioning is performed manually.
744   std::unique_ptr<LoopVersioning> LVer;
745 
746   /// The vectorization SIMD factor to use. Each vector will have this many
747   /// vector elements.
748   ElementCount VF;
749 
750   /// The vectorization unroll factor to use. Each scalar is vectorized to this
751   /// many different vector instructions.
752   unsigned UF;
753 
754   /// The builder that we use to construct the vectorized instructions.
755   IRBuilder<> Builder;
756 
757   // --- Vectorization state ---
758 
759   /// The vector-loop preheader.
760   BasicBlock *LoopVectorPreHeader;
761 
762   /// The scalar-loop preheader.
763   BasicBlock *LoopScalarPreHeader;
764 
765   /// Middle Block between the vector and the scalar loops.
766   BasicBlock *LoopMiddleBlock;
767 
768   /// The unique ExitBlock of the scalar loop if one exists.  Note that
769   /// there can be multiple exiting edges reaching this block.
770   BasicBlock *LoopExitBlock;
771 
772   /// The vector loop body.
773   BasicBlock *LoopVectorBody;
774 
775   /// The scalar loop body.
776   BasicBlock *LoopScalarBody;
777 
778   /// A list of all bypass blocks. The first block is the entry of the loop.
779   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
780 
781   /// Store instructions that were predicated.
782   SmallVector<Instruction *, 4> PredicatedInstructions;
783 
784   /// Trip count of the original loop.
785   Value *TripCount = nullptr;
786 
787   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
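  /// For example, with TripCount = 17, VF = 4 and UF = 2 this is
  /// 17 - (17 % 8) = 16; the single remaining iteration is left to the scalar
  /// epilogue.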
788   Value *VectorTripCount = nullptr;
789 
790   /// The legality analysis.
791   LoopVectorizationLegality *Legal;
792 
793   /// The profitability analysis.
794   LoopVectorizationCostModel *Cost;
795 
796   // Record whether runtime checks are added.
797   bool AddedSafetyChecks = false;
798 
799   // Holds the end values for each induction variable. We save the end values
800   // so we can later fix-up the external users of the induction variables.
801   DenseMap<PHINode *, Value *> IVEndValues;
802 
803   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
804   // fixed up at the end of vector code generation.
805   SmallVector<PHINode *, 8> OrigPHIsToFix;
806 
807   /// BFI and PSI are used to check for profile-guided size optimizations.
808   BlockFrequencyInfo *BFI;
809   ProfileSummaryInfo *PSI;
810 
811   // Whether this loop should be optimized for size based on profile-guided
812   // size optimizations.
813   bool OptForSizeBasedOnProfile;
814 
815   /// Structure to hold information about generated runtime checks, responsible
816   /// for cleaning the checks, if vectorization turns out unprofitable.
817   GeneratedRTChecks &RTChecks;
818 
819   // Holds the resume values for reductions in the loops, used to set the
820   // correct start value of reduction PHIs when vectorizing the epilogue.
821   SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
822       ReductionResumeValues;
823 };
824 
825 class InnerLoopUnroller : public InnerLoopVectorizer {
826 public:
827   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
828                     LoopInfo *LI, DominatorTree *DT,
829                     const TargetLibraryInfo *TLI,
830                     const TargetTransformInfo *TTI, AssumptionCache *AC,
831                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
832                     LoopVectorizationLegality *LVL,
833                     LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
834                     ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
835       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
836                             ElementCount::getFixed(1), UnrollFactor, LVL, CM,
837                             BFI, PSI, Check) {}
838 
839 private:
840   Value *getBroadcastInstrs(Value *V) override;
841 };
842 
843 /// Encapsulate information regarding vectorization of a loop and its epilogue.
844 /// This information is meant to be updated and used across two stages of
845 /// epilogue vectorization.
846 struct EpilogueLoopVectorizationInfo {
847   ElementCount MainLoopVF = ElementCount::getFixed(0);
848   unsigned MainLoopUF = 0;
849   ElementCount EpilogueVF = ElementCount::getFixed(0);
850   unsigned EpilogueUF = 0;
851   BasicBlock *MainLoopIterationCountCheck = nullptr;
852   BasicBlock *EpilogueIterationCountCheck = nullptr;
853   BasicBlock *SCEVSafetyCheck = nullptr;
854   BasicBlock *MemSafetyCheck = nullptr;
855   Value *TripCount = nullptr;
856   Value *VectorTripCount = nullptr;
857 
858   EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
859                                 ElementCount EVF, unsigned EUF)
860       : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
861     assert(EUF == 1 &&
862            "A high UF for the epilogue loop is likely not beneficial.");
863   }
864 };
865 
866 /// An extension of the inner loop vectorizer that creates a skeleton for a
867 /// vectorized loop that has its epilogue (residual) also vectorized.
868 /// The idea is to run the VPlan on a given loop twice, first to set up the
869 /// skeleton and vectorize the main loop, and second to complete the skeleton
870 /// from the first step and vectorize the epilogue.  This is achieved by
871 /// deriving two concrete strategy classes from this base class and invoking
872 /// them in succession from the loop vectorizer planner.
873 class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
874 public:
875   InnerLoopAndEpilogueVectorizer(
876       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
877       DominatorTree *DT, const TargetLibraryInfo *TLI,
878       const TargetTransformInfo *TTI, AssumptionCache *AC,
879       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
880       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
881       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
882       GeneratedRTChecks &Checks)
883       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
884                             EPI.MainLoopVF, EPI.MainLoopUF, LVL, CM, BFI, PSI,
885                             Checks),
886         EPI(EPI) {}
887 
888   // Override this function to handle the more complex control flow around the
889   // three loops.
890   std::pair<BasicBlock *, Value *>
891   createVectorizedLoopSkeleton() final override {
892     return createEpilogueVectorizedLoopSkeleton();
893   }
894 
895   /// The interface for creating a vectorized skeleton using one of two
896   /// different strategies, each corresponding to one execution of the vplan
897   /// as described above.
898   virtual std::pair<BasicBlock *, Value *>
899   createEpilogueVectorizedLoopSkeleton() = 0;
900 
901   /// Holds and updates state information required to vectorize the main loop
902   /// and its epilogue in two separate passes. This setup helps us avoid
903   /// regenerating and recomputing runtime safety checks. It also helps us to
904   /// shorten the iteration-count-check path length for the cases where the
905   /// iteration count of the loop is so small that the main vector loop is
906   /// completely skipped.
907   EpilogueLoopVectorizationInfo &EPI;
908 };
909 
910 /// A specialized derived class of inner loop vectorizer that performs
911 /// vectorization of *main* loops in the process of vectorizing loops and their
912 /// epilogues.
913 class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
914 public:
915   EpilogueVectorizerMainLoop(
916       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
917       DominatorTree *DT, const TargetLibraryInfo *TLI,
918       const TargetTransformInfo *TTI, AssumptionCache *AC,
919       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
920       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
921       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
922       GeneratedRTChecks &Check)
923       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
924                                        EPI, LVL, CM, BFI, PSI, Check) {}
925   /// Implements the interface for creating a vectorized skeleton using the
926   /// *main loop* strategy (i.e., the first pass of VPlan execution).
927   std::pair<BasicBlock *, Value *>
928   createEpilogueVectorizedLoopSkeleton() final override;
929 
930 protected:
931   /// Emits an iteration count bypass check once for the main loop (when \p
932   /// ForEpilogue is false) and once for the epilogue loop (when \p
933   /// ForEpilogue is true).
934   BasicBlock *emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass,
935                                              bool ForEpilogue);
936   void printDebugTracesAtStart() override;
937   void printDebugTracesAtEnd() override;
938 };
939 
940 /// A specialized derived class of inner loop vectorizer that performs
941 /// vectorization of *epilogue* loops in the process of vectorizing loops and
942 /// their epilogues.
943 class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
944 public:
945   EpilogueVectorizerEpilogueLoop(
946       Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
947       DominatorTree *DT, const TargetLibraryInfo *TLI,
948       const TargetTransformInfo *TTI, AssumptionCache *AC,
949       OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
950       LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
951       BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
952       GeneratedRTChecks &Checks)
953       : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
954                                        EPI, LVL, CM, BFI, PSI, Checks) {}
955   /// Implements the interface for creating a vectorized skeleton using the
956   /// *epilogue loop* strategy (i.e., the second pass of VPlan execution).
957   std::pair<BasicBlock *, Value *>
958   createEpilogueVectorizedLoopSkeleton() final override;
959 
960 protected:
961   /// Emits an iteration count bypass check after the main vector loop has
962   /// finished to see if there are any iterations left to execute by either
963   /// the vector epilogue or the scalar epilogue.
964   BasicBlock *emitMinimumVectorEpilogueIterCountCheck(Loop *L,
965                                                       BasicBlock *Bypass,
966                                                       BasicBlock *Insert);
967   void printDebugTracesAtStart() override;
968   void printDebugTracesAtEnd() override;
969 };
970 } // end namespace llvm
971 
972 /// Look for a meaningful debug location on the instruction or its
973 /// operands.
974 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
975   if (!I)
976     return I;
977 
978   DebugLoc Empty;
979   if (I->getDebugLoc() != Empty)
980     return I;
981 
982   for (Use &Op : I->operands()) {
983     if (Instruction *OpInst = dyn_cast<Instruction>(Op))
984       if (OpInst->getDebugLoc() != Empty)
985         return OpInst;
986   }
987 
988   return I;
989 }
990 
991 void InnerLoopVectorizer::setDebugLocFromInst(
992     const Value *V, Optional<IRBuilderBase *> CustomBuilder) {
993   IRBuilderBase *B = (CustomBuilder == None) ? &Builder : *CustomBuilder;
994   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(V)) {
995     const DILocation *DIL = Inst->getDebugLoc();
996 
997     // When an FSDiscriminator is enabled, we don't need to add the multiply
998     // factors to the discriminators.
999     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
1000         !isa<DbgInfoIntrinsic>(Inst) && !EnableFSDiscriminator) {
1001       // FIXME: For scalable vectors, assume vscale=1.
1002       auto NewDIL =
1003           DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
1004       if (NewDIL)
1005         B->SetCurrentDebugLocation(NewDIL.getValue());
1006       else
1007         LLVM_DEBUG(dbgs()
1008                    << "Failed to create new discriminator: "
1009                    << DIL->getFilename() << " Line: " << DIL->getLine());
1010     } else
1011       B->SetCurrentDebugLocation(DIL);
1012   } else
1013     B->SetCurrentDebugLocation(DebugLoc());
1014 }
1015 
1016 /// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
1017 /// is passed, the message relates to that particular instruction.
1018 #ifndef NDEBUG
1019 static void debugVectorizationMessage(const StringRef Prefix,
1020                                       const StringRef DebugMsg,
1021                                       Instruction *I) {
1022   dbgs() << "LV: " << Prefix << DebugMsg;
1023   if (I != nullptr)
1024     dbgs() << " " << *I;
1025   else
1026     dbgs() << '.';
1027   dbgs() << '\n';
1028 }
1029 #endif
1030 
1031 /// Create an analysis remark that explains why vectorization failed
1032 ///
1033 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
1034 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
1035 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
1036 /// the location of the remark.  \return the remark object that can be
1037 /// streamed to.
1038 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
1039     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
1040   Value *CodeRegion = TheLoop->getHeader();
1041   DebugLoc DL = TheLoop->getStartLoc();
1042 
1043   if (I) {
1044     CodeRegion = I->getParent();
1045     // If there is no debug location attached to the instruction, fall back to
1046     // using the loop's location.
1047     if (I->getDebugLoc())
1048       DL = I->getDebugLoc();
1049   }
1050 
1051   return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
1052 }
1053 
1054 namespace llvm {
1055 
1056 /// Return a value for Step multiplied by VF.
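/// For example, for Step = 2 and a fixed VF of 4 this returns the constant 8;
/// for a scalable VF of <vscale x 4> it returns the runtime value 8 * vscale.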
1057 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
1058                        int64_t Step) {
1059   assert(Ty->isIntegerTy() && "Expected an integer step");
1060   Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
1061   return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
1062 }
1063 
1064 /// Return the runtime value for VF.
1065 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
1066   Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
1067   return VF.isScalable() ? B.CreateVScale(EC) : EC;
1068 }
1069 
1070 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
1071                                   ElementCount VF) {
1072   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
1073   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
1074   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
1075   return B.CreateUIToFP(RuntimeVF, FTy);
1076 }
1077 
1078 void reportVectorizationFailure(const StringRef DebugMsg,
1079                                 const StringRef OREMsg, const StringRef ORETag,
1080                                 OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1081                                 Instruction *I) {
1082   LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
1083   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1084   ORE->emit(
1085       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1086       << "loop not vectorized: " << OREMsg);
1087 }
1088 
1089 void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
1090                              OptimizationRemarkEmitter *ORE, Loop *TheLoop,
1091                              Instruction *I) {
1092   LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
1093   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
1094   ORE->emit(
1095       createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
1096       << Msg);
1097 }
1098 
1099 } // end namespace llvm
1100 
1101 #ifndef NDEBUG
1102 /// \return string containing a file name and a line # for the given loop.
1103 static std::string getDebugLocString(const Loop *L) {
1104   std::string Result;
1105   if (L) {
1106     raw_string_ostream OS(Result);
1107     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
1108       LoopDbgLoc.print(OS);
1109     else
1110       // Just print the module name.
1111       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
1112     OS.flush();
1113   }
1114   return Result;
1115 }
1116 #endif
1117 
1118 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
1119                                          const Instruction *Orig) {
1120   // If the loop was versioned with memchecks, add the corresponding no-alias
1121   // metadata.
1122   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
1123     LVer->annotateInstWithNoAlias(To, Orig);
1124 }
1125 
1126 void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
1127     VPTransformState &State) {
1128 
1129   // Collect recipes in the backward slice of `Root` that may generate a poison
1130   // value that is used after vectorization.
1131   SmallPtrSet<VPRecipeBase *, 16> Visited;
1132   auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
1133     SmallVector<VPRecipeBase *, 16> Worklist;
1134     Worklist.push_back(Root);
1135 
1136     // Traverse the backward slice of Root through its use-def chain.
1137     while (!Worklist.empty()) {
1138       VPRecipeBase *CurRec = Worklist.back();
1139       Worklist.pop_back();
1140 
1141       if (!Visited.insert(CurRec).second)
1142         continue;
1143 
1144       // Prune search if we find another recipe generating a widen memory
1145       // instruction. Widen memory instructions involved in address computation
1146       // will lead to gather/scatter instructions, which don't need to be
1147       // handled.
1148       if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
1149           isa<VPInterleaveRecipe>(CurRec) ||
1150           isa<VPCanonicalIVPHIRecipe>(CurRec))
1151         continue;
1152 
1153       // This recipe contributes to the address computation of a widen
1154       // load/store. Collect the recipe if its underlying instruction has
1155       // poison-generating flags.
1156       Instruction *Instr = CurRec->getUnderlyingInstr();
1157       if (Instr && Instr->hasPoisonGeneratingFlags())
1158         State.MayGeneratePoisonRecipes.insert(CurRec);
1159 
1160       // Add new definitions to the worklist.
1161       for (VPValue *Operand : CurRec->operands())
1162         if (VPDef *OpDef = Operand->getDef())
1163           Worklist.push_back(cast<VPRecipeBase>(OpDef));
1164     }
1165   });
1166 
1167   // Traverse all the recipes in the VPlan and collect the poison-generating
1168   // recipes in the backward slice starting at the address of a
1169   // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
1170   auto Iter = depth_first(
1171       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
1172   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
1173     for (VPRecipeBase &Recipe : *VPBB) {
1174       if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
1175         Instruction *UnderlyingInstr = WidenRec->getUnderlyingInstr();
1176         VPDef *AddrDef = WidenRec->getAddr()->getDef();
1177         if (AddrDef && WidenRec->isConsecutive() && UnderlyingInstr &&
1178             Legal->blockNeedsPredication(UnderlyingInstr->getParent()))
1179           collectPoisonGeneratingInstrsInBackwardSlice(
1180               cast<VPRecipeBase>(AddrDef));
1181       } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
1182         VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
1183         if (AddrDef) {
1184           // Check if any member of the interleave group needs predication.
1185           const InterleaveGroup<Instruction> *InterGroup =
1186               InterleaveRec->getInterleaveGroup();
1187           bool NeedPredication = false;
1188           for (int I = 0, NumMembers = InterGroup->getNumMembers();
1189                I < NumMembers; ++I) {
1190             Instruction *Member = InterGroup->getMember(I);
1191             if (Member)
1192               NeedPredication |=
1193                   Legal->blockNeedsPredication(Member->getParent());
1194           }
1195 
1196           if (NeedPredication)
1197             collectPoisonGeneratingInstrsInBackwardSlice(
1198                 cast<VPRecipeBase>(AddrDef));
1199         }
1200       }
1201     }
1202   }
1203 }
1204 
1205 void InnerLoopVectorizer::addMetadata(Instruction *To,
1206                                       Instruction *From) {
1207   propagateMetadata(To, From);
1208   addNewMetadata(To, From);
1209 }
1210 
1211 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
1212                                       Instruction *From) {
1213   for (Value *V : To) {
1214     if (Instruction *I = dyn_cast<Instruction>(V))
1215       addMetadata(I, From);
1216   }
1217 }
1218 
1219 PHINode *InnerLoopVectorizer::getReductionResumeValue(
1220     const RecurrenceDescriptor &RdxDesc) {
1221   auto It = ReductionResumeValues.find(&RdxDesc);
1222   assert(It != ReductionResumeValues.end() &&
1223          "Expected to find a resume value for the reduction.");
1224   return It->second;
1225 }
1226 
1227 namespace llvm {
1228 
// Loop vectorization cost-model hints describing how the scalar epilogue loop
// should be lowered.
1231 enum ScalarEpilogueLowering {
1232 
1233   // The default: allowing scalar epilogues.
1234   CM_ScalarEpilogueAllowed,
1235 
1236   // Vectorization with OptForSize: don't allow epilogues.
1237   CM_ScalarEpilogueNotAllowedOptSize,
1238 
  // A special case of vectorization with OptForSize: loops with a very small
1240   // trip count are considered for vectorization under OptForSize, thereby
1241   // making sure the cost of their loop body is dominant, free of runtime
1242   // guards and scalar iteration overheads.
1243   CM_ScalarEpilogueNotAllowedLowTripLoop,
1244 
1245   // Loop hint predicate indicating an epilogue is undesired.
1246   CM_ScalarEpilogueNotNeededUsePredicate,
1247 
  // Directive indicating we must either tail fold or not vectorize.
1249   CM_ScalarEpilogueNotAllowedUsePredicate
1250 };
1251 
1252 /// ElementCountComparator creates a total ordering for ElementCount
1253 /// for the purposes of using it in a set structure.
1254 struct ElementCountComparator {
1255   bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
1256     return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
1257            std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
1258   }
1259 };
1260 using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
1261 
1262 /// LoopVectorizationCostModel - estimates the expected speedups due to
1263 /// vectorization.
/// In many cases vectorization is not profitable. This can happen for a
/// number of reasons. In this class we mainly attempt to predict the
1266 /// expected speedup/slowdowns due to the supported instruction set. We use the
1267 /// TargetTransformInfo to query the different backends for the cost of
1268 /// different operations.
1269 class LoopVectorizationCostModel {
1270 public:
1271   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1272                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1273                              LoopVectorizationLegality *Legal,
1274                              const TargetTransformInfo &TTI,
1275                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1276                              AssumptionCache *AC,
1277                              OptimizationRemarkEmitter *ORE, const Function *F,
1278                              const LoopVectorizeHints *Hints,
1279                              InterleavedAccessInfo &IAI)
1280       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1281         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1282         Hints(Hints), InterleaveInfo(IAI) {}
1283 
1284   /// \return An upper bound for the vectorization factors (both fixed and
1285   /// scalable). If the factors are 0, vectorization and interleaving should be
1286   /// avoided up front.
1287   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1288 
1289   /// \return True if runtime checks are required for vectorization, and false
1290   /// otherwise.
1291   bool runtimeChecksRequired();
1292 
1293   /// \return The most profitable vectorization factor and the cost of that VF.
1294   /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1295   /// then this vectorization factor will be selected if vectorization is
1296   /// possible.
1297   VectorizationFactor
1298   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1299 
1300   VectorizationFactor
1301   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1302                                     const LoopVectorizationPlanner &LVP);
1303 
  /// Set up cost-based decisions for the user vectorization factor.
1305   /// \return true if the UserVF is a feasible VF to be chosen.
1306   bool selectUserVectorizationFactor(ElementCount UserVF) {
1307     collectUniformsAndScalars(UserVF);
1308     collectInstsToScalarize(UserVF);
1309     return expectedCost(UserVF).first.isValid();
1310   }
1311 
1312   /// \return The size (in bits) of the smallest and widest types in the code
1313   /// that needs to be vectorized. We ignore values that remain scalar such as
1314   /// 64 bit loop indices.
1315   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1316 
1317   /// \return The desired interleave count.
1318   /// If interleave count has been specified by metadata it will be returned.
1319   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1320   /// are the selected vectorization factor and the cost of the selected VF.
1321   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1322 
  /// A memory access instruction may be vectorized in more than one way. The
  /// form of the instruction after vectorization depends on its cost. This
  /// function makes cost-based decisions for Load/Store instructions and
  /// collects them in a map. This decision map is used for building the lists
  /// of loop-uniform and loop-scalar instructions. The calculated cost is
  /// saved with the widening decision in order to avoid redundant
  /// calculations.
1330   void setCostBasedWideningDecision(ElementCount VF);
1331 
1332   /// A struct that represents some properties of the register usage
1333   /// of a loop.
1334   struct RegisterUsage {
1335     /// Holds the number of loop invariant values that are used in the loop.
1336     /// The key is ClassID of target-provided register class.
1337     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1338     /// Holds the maximum number of concurrent live intervals in the loop.
1339     /// The key is ClassID of target-provided register class.
1340     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1341   };
1342 
  /// \return Information about the register usage of the loop for the given
  /// vectorization factors.
1345   SmallVector<RegisterUsage, 8>
1346   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1347 
1348   /// Collect values we want to ignore in the cost model.
1349   void collectValuesToIgnore();
1350 
1351   /// Collect all element types in the loop for which widening is needed.
1352   void collectElementTypesForWidening();
1353 
  /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1356   void collectInLoopReductions();
1357 
  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the IsOrdered flag of RdxDesc is set and we do
  /// not allow reordering of FP operations.
1362   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) {
1363     return !Hints->allowReordering() && RdxDesc.isOrdered();
1364   }
1365 
1366   /// \returns The smallest bitwidth each instruction can be represented with.
1367   /// The vector equivalents of these instructions should be truncated to this
1368   /// type.
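  /// For illustration: if demanded-bits analysis shows an i32 value only ever
  /// carries 8 significant bits, its minimal bitwidth is 8 and its vector form
  /// can be computed as <VF x i8> and extended where a wider use requires it.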
1369   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1370     return MinBWs;
1371   }
1372 
1373   /// \returns True if it is more profitable to scalarize instruction \p I for
1374   /// vectorization factor \p VF.
1375   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1376     assert(VF.isVector() &&
1377            "Profitable to scalarize relevant only for VF > 1.");
1378 
1379     // Cost model is not run in the VPlan-native path - return conservative
1380     // result until this changes.
1381     if (EnableVPlanNativePath)
1382       return false;
1383 
1384     auto Scalars = InstsToScalarize.find(VF);
1385     assert(Scalars != InstsToScalarize.end() &&
1386            "VF not yet analyzed for scalarization profitability");
1387     return Scalars->second.find(I) != Scalars->second.end();
1388   }
1389 
1390   /// Returns true if \p I is known to be uniform after vectorization.
1391   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1392     if (VF.isScalar())
1393       return true;
1394 
1395     // Cost model is not run in the VPlan-native path - return conservative
1396     // result until this changes.
1397     if (EnableVPlanNativePath)
1398       return false;
1399 
1400     auto UniformsPerVF = Uniforms.find(VF);
1401     assert(UniformsPerVF != Uniforms.end() &&
1402            "VF not yet analyzed for uniformity");
1403     return UniformsPerVF->second.count(I);
1404   }
1405 
1406   /// Returns true if \p I is known to be scalar after vectorization.
1407   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1408     if (VF.isScalar())
1409       return true;
1410 
1411     // Cost model is not run in the VPlan-native path - return conservative
1412     // result until this changes.
1413     if (EnableVPlanNativePath)
1414       return false;
1415 
1416     auto ScalarsPerVF = Scalars.find(VF);
1417     assert(ScalarsPerVF != Scalars.end() &&
1418            "Scalar values are not calculated for VF");
1419     return ScalarsPerVF->second.count(I);
1420   }
1421 
1422   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1423   /// for vectorization factor \p VF.
1424   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1425     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1426            !isProfitableToScalarize(I, VF) &&
1427            !isScalarAfterVectorization(I, VF);
1428   }
1429 
1430   /// Decision that was taken during cost calculation for memory instruction.
1431   enum InstWidening {
1432     CM_Unknown,
1433     CM_Widen,         // For consecutive accesses with stride +1.
1434     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1435     CM_Interleave,
1436     CM_GatherScatter,
1437     CM_Scalarize
1438   };
1439 
1440   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1441   /// instruction \p I and vector width \p VF.
1442   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1443                            InstructionCost Cost) {
1444     assert(VF.isVector() && "Expected VF >=2");
1445     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1446   }
1447 
1448   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1449   /// interleaving group \p Grp and vector width \p VF.
1450   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1451                            ElementCount VF, InstWidening W,
1452                            InstructionCost Cost) {
1453     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group.
    // But the cost will be assigned to one instruction only.
1456     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1457       if (auto *I = Grp->getMember(i)) {
1458         if (Grp->getInsertPos() == I)
1459           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1460         else
1461           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1462       }
1463     }
1464   }
1465 
1466   /// Return the cost model decision for the given instruction \p I and vector
1467   /// width \p VF. Return CM_Unknown if this instruction did not pass
1468   /// through the cost modeling.
1469   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1470     assert(VF.isVector() && "Expected VF to be a vector VF");
1471     // Cost model is not run in the VPlan-native path - return conservative
1472     // result until this changes.
1473     if (EnableVPlanNativePath)
1474       return CM_GatherScatter;
1475 
1476     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1477     auto Itr = WideningDecisions.find(InstOnVF);
1478     if (Itr == WideningDecisions.end())
1479       return CM_Unknown;
1480     return Itr->second.first;
1481   }
1482 
1483   /// Return the vectorization cost for the given instruction \p I and vector
1484   /// width \p VF.
1485   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1486     assert(VF.isVector() && "Expected VF >=2");
1487     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1488     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1489            "The cost is not calculated");
1490     return WideningDecisions[InstOnVF].second;
1491   }
1492 
1493   /// Return True if instruction \p I is an optimizable truncate whose operand
1494   /// is an induction variable. Such a truncate will be removed by adding a new
1495   /// induction variable with the destination type.
1496   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1497     // If the instruction is not a truncate, return false.
1498     auto *Trunc = dyn_cast<TruncInst>(I);
1499     if (!Trunc)
1500       return false;
1501 
1502     // Get the source and destination types of the truncate.
1503     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1504     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1505 
1506     // If the truncate is free for the given types, return false. Replacing a
1507     // free truncate with an induction variable would add an induction variable
1508     // update instruction to each iteration of the loop. We exclude from this
1509     // check the primary induction variable since it will need an update
1510     // instruction regardless.
1511     Value *Op = Trunc->getOperand(0);
1512     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1513       return false;
1514 
1515     // If the truncated value is not an induction variable, return false.
1516     return Legal->isInductionPhi(Op);
1517   }
1518 
1519   /// Collects the instructions to scalarize for each predicated instruction in
1520   /// the loop.
1521   void collectInstsToScalarize(ElementCount VF);
1522 
1523   /// Collect Uniform and Scalar values for the given \p VF.
1524   /// The sets depend on CM decision for Load/Store instructions
1525   /// that may be vectorized as interleave, gather-scatter or scalarized.
1526   void collectUniformsAndScalars(ElementCount VF) {
1527     // Do the analysis once.
1528     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1529       return;
1530     setCostBasedWideningDecision(VF);
1531     collectLoopUniforms(VF);
1532     collectLoopScalars(VF);
1533   }
1534 
  /// Returns true if the target machine supports a masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
1537   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1538     return Legal->isConsecutivePtr(DataType, Ptr) &&
1539            TTI.isLegalMaskedStore(DataType, Alignment);
1540   }
1541 
  /// Returns true if the target machine supports a masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
1544   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1545     return Legal->isConsecutivePtr(DataType, Ptr) &&
1546            TTI.isLegalMaskedLoad(DataType, Alignment);
1547   }
1548 
1549   /// Returns true if the target machine can represent \p V as a masked gather
1550   /// or scatter operation.
1551   bool isLegalGatherOrScatter(Value *V,
1552                               ElementCount VF = ElementCount::getFixed(1)) {
1553     bool LI = isa<LoadInst>(V);
1554     bool SI = isa<StoreInst>(V);
1555     if (!LI && !SI)
1556       return false;
1557     auto *Ty = getLoadStoreType(V);
1558     Align Align = getLoadStoreAlignment(V);
1559     if (VF.isVector())
1560       Ty = VectorType::get(Ty, VF);
1561     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1562            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1563   }
1564 
1565   /// Returns true if the target machine supports all of the reduction
1566   /// variables found for the given VF.
1567   bool canVectorizeReductions(ElementCount VF) const {
1568     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1569       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1570       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1571     }));
1572   }
1573 
1574   /// Returns true if \p I is an instruction that will be scalarized with
1575   /// predication when vectorizing \p I with vectorization factor \p VF. Such
1576   /// instructions include conditional stores and instructions that may divide
1577   /// by zero.
1578   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1579 
1580   // Returns true if \p I is an instruction that will be predicated either
1581   // through scalar predication or masked load/store or masked gather/scatter.
1582   // \p VF is the vectorization factor that will be used to vectorize \p I.
1583   // Superset of instructions that return true for isScalarWithPredication.
1584   bool isPredicatedInst(Instruction *I, ElementCount VF,
1585                         bool IsKnownUniform = false) {
    // When we know the load is uniform and the original scalar loop was not
    // predicated, we don't need to mark it as a predicated instruction. Any
    // vectorized blocks created when tail-folding are artificial blocks that
    // we have introduced ourselves, and we know there is always at least one
    // active lane. That's why we call Legal->blockNeedsPredication here,
    // because it doesn't query tail-folding.
1592     if (IsKnownUniform && isa<LoadInst>(I) &&
1593         !Legal->blockNeedsPredication(I->getParent()))
1594       return false;
1595     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1596       return false;
1597     // Loads and stores that need some form of masked operation are predicated
1598     // instructions.
1599     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1600       return Legal->isMaskRequired(I);
1601     return isScalarWithPredication(I, VF);
1602   }
1603 
1604   /// Returns true if \p I is a memory instruction with consecutive memory
1605   /// access that can be widened.
1606   bool
1607   memoryInstructionCanBeWidened(Instruction *I,
1608                                 ElementCount VF = ElementCount::getFixed(1));
1609 
1610   /// Returns true if \p I is a memory instruction in an interleaved-group
1611   /// of memory accesses that can be vectorized with wide vector loads/stores
1612   /// and shuffles.
1613   bool
1614   interleavedAccessCanBeWidened(Instruction *I,
1615                                 ElementCount VF = ElementCount::getFixed(1));
1616 
1617   /// Check if \p Instr belongs to any interleaved access group.
1618   bool isAccessInterleaved(Instruction *Instr) {
1619     return InterleaveInfo.isInterleaved(Instr);
1620   }
1621 
1622   /// Get the interleaved access group that \p Instr belongs to.
1623   const InterleaveGroup<Instruction> *
1624   getInterleavedAccessGroup(Instruction *Instr) {
1625     return InterleaveInfo.getInterleaveGroup(Instr);
1626   }
1627 
1628   /// Returns true if we're required to use a scalar epilogue for at least
1629   /// the final iteration of the original loop.
1630   bool requiresScalarEpilogue(ElementCount VF) const {
1631     if (!isScalarEpilogueAllowed())
1632       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
    // iteration in scalar form.
1635     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1636       return true;
1637     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1638   }
1639 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disabled due to optsize or a loop hint annotation.
1642   bool isScalarEpilogueAllowed() const {
1643     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1644   }
1645 
  /// Returns true if all loop blocks should be masked to fold the tail loop.
1647   bool foldTailByMasking() const { return FoldTailByMasking; }
1648 
  /// Returns true if the instructions in this block require predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
1652   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1653     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1654   }
1655 
1656   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1657   /// nodes to the chain of instructions representing the reductions. Uses a
1658   /// MapVector to ensure deterministic iteration order.
1659   using ReductionChainMap =
1660       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1661 
1662   /// Return the chain of instructions representing an inloop reduction.
1663   const ReductionChainMap &getInLoopReductionChains() const {
1664     return InLoopReductionChains;
1665   }
1666 
1667   /// Returns true if the Phi is part of an inloop reduction.
1668   bool isInLoopReduction(PHINode *Phi) const {
1669     return InLoopReductionChains.count(Phi);
1670   }
1671 
1672   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1673   /// with factor VF.  Return the cost of the instruction, including
1674   /// scalarization overhead if it's needed.
1675   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1676 
1677   /// Estimate cost of a call instruction CI if it were vectorized with factor
1678   /// VF. Return the cost of the instruction, including scalarization overhead
1679   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1680   /// scalarized -
1681   /// i.e. either vector version isn't available, or is too expensive.
1682   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1683                                     bool &NeedToScalarize) const;
1684 
1685   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1686   /// that of B.
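  /// For example, VF = 4 at cost 20 (5 per lane) is considered more profitable
  /// than VF = 2 at cost 12 (6 per lane).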
1687   bool isMoreProfitable(const VectorizationFactor &A,
1688                         const VectorizationFactor &B) const;
1689 
1690   /// Invalidates decisions already taken by the cost model.
1691   void invalidateCostModelingDecisions() {
1692     WideningDecisions.clear();
1693     Uniforms.clear();
1694     Scalars.clear();
1695   }
1696 
1697 private:
1698   unsigned NumPredStores = 0;
1699 
  /// Convenience function that returns the value of vscale_range if
  /// vscale_range.min == vscale_range.max, or otherwise returns the value
  /// returned by the corresponding TTI method.
1703   Optional<unsigned> getVScaleForTuning() const;
1704 
1705   /// \return An upper bound for the vectorization factors for both
1706   /// fixed and scalable vectorization, where the minimum-known number of
1707   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1708   /// disabled or unsupported, then the scalable part will be equal to
1709   /// ElementCount::getScalable(0).
1710   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1711                                            ElementCount UserVF,
1712                                            bool FoldTailByMasking);
1713 
  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip count, but limited to a maximum safe VF.
1716   /// This is a helper function of computeFeasibleMaxVF.
1717   /// FIXME: MaxSafeVF is currently passed by reference to avoid some obscure
1718   /// issue that occurred on one of the buildbots which cannot be reproduced
  /// without having access to the proprietary compiler (see comments on
1720   /// D98509). The issue is currently under investigation and this workaround
1721   /// will be removed as soon as possible.
1722   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1723                                        unsigned SmallestType,
1724                                        unsigned WidestType,
1725                                        const ElementCount &MaxSafeVF,
1726                                        bool FoldTailByMasking);
1727 
1728   /// \return the maximum legal scalable VF, based on the safe max number
1729   /// of elements.
1730   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1731 
1732   /// The vectorization cost is a combination of the cost itself and a boolean
1733   /// indicating whether any of the contributing operations will actually
1734   /// operate on vector values after type legalization in the backend. If this
1735   /// latter value is false, then all operations will be scalarized (i.e. no
1736   /// vectorization has actually taken place).
1737   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1738 
1739   /// Returns the expected execution cost. The unit of the cost does
1740   /// not matter because we use the 'cost' units to compare different
1741   /// vector widths. The cost that is returned is *not* normalized by
1742   /// the factor width. If \p Invalid is not nullptr, this function
1743   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1744   /// each instruction that has an Invalid cost for the given VF.
1745   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1746   VectorizationCostTy
1747   expectedCost(ElementCount VF,
1748                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1749 
1750   /// Returns the execution time cost of an instruction for a given vector
1751   /// width. Vector width of one means scalar.
1752   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1753 
1754   /// The cost-computation logic from getInstructionCost which provides
1755   /// the vector type as an output parameter.
1756   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1757                                      Type *&VectorTy);
1758 
1759   /// Return the cost of instructions in an inloop reduction pattern, if I is
1760   /// part of that pattern.
1761   Optional<InstructionCost>
1762   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1763                           TTI::TargetCostKind CostKind);
1764 
1765   /// Calculate vectorization cost of memory instruction \p I.
1766   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1767 
1768   /// The cost computation for scalarized memory instruction.
1769   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1770 
1771   /// The cost computation for interleaving group of memory instructions.
1772   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1773 
1774   /// The cost computation for Gather/Scatter instruction.
1775   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1776 
1777   /// The cost computation for widening instruction \p I with consecutive
1778   /// memory access.
1779   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1780 
1781   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1782   /// Load: scalar load + broadcast.
1783   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1784   /// element)
1785   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1786 
1787   /// Estimate the overhead of scalarizing an instruction. This is a
1788   /// convenience wrapper for the type-based getScalarizationOverhead API.
1789   InstructionCost getScalarizationOverhead(Instruction *I,
1790                                            ElementCount VF) const;
1791 
  /// Returns whether the instruction is a load or store and will be emitted
  /// as a vector operation.
1794   bool isConsecutiveLoadOrStore(Instruction *I);
1795 
1796   /// Map of scalar integer values to the smallest bitwidth they can be legally
1797   /// represented as. The vector equivalents of these values should be truncated
1798   /// to this type.
1799   MapVector<Instruction *, uint64_t> MinBWs;
1800 
1801   /// A type representing the costs for instructions if they were to be
1802   /// scalarized rather than vectorized. The entries are Instruction-Cost
1803   /// pairs.
1804   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1805 
  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as predicated blocks.
1808   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1809 
1810   /// Records whether it is allowed to have the original scalar loop execute at
1811   /// least once. This may be needed as a fallback loop in case runtime
1812   /// aliasing/dependence checks fail, or to handle the tail/remainder
1813   /// iterations when the trip count is unknown or doesn't divide by the VF,
1814   /// or as a peel-loop to handle gaps in interleave-groups.
1815   /// Under optsize and when the trip count is very small we don't allow any
1816   /// iterations to execute in the scalar loop.
1817   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1818 
  /// All blocks of the loop are to be masked to fold the tail of the scalar
  /// iterations.
1820   bool FoldTailByMasking = false;
1821 
1822   /// A map holding scalar costs for different vectorization factors. The
1823   /// presence of a cost for an instruction in the mapping indicates that the
1824   /// instruction will be scalarized when vectorizing with the associated
1825   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1826   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1827 
1828   /// Holds the instructions known to be uniform after vectorization.
1829   /// The data is collected per VF.
1830   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1831 
1832   /// Holds the instructions known to be scalar after vectorization.
1833   /// The data is collected per VF.
1834   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1835 
1836   /// Holds the instructions (address computations) that are forced to be
1837   /// scalarized.
1838   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1839 
1840   /// PHINodes of the reductions that should be expanded in-loop along with
1841   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1843   ReductionChainMap InLoopReductionChains;
1844 
1845   /// A Map of inloop reduction operations and their immediate chain operand.
1846   /// FIXME: This can be removed once reductions can be costed correctly in
1847   /// vplan. This was added to allow quick lookup to the inloop operations,
1848   /// without having to loop through InLoopReductionChains.
1849   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1850 
1851   /// Returns the expected difference in cost from scalarizing the expression
1852   /// feeding a predicated instruction \p PredInst. The instructions to
1853   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1854   /// non-negative return value implies the expression will be scalarized.
1855   /// Currently, only single-use chains are considered for scalarization.
1856   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1857                               ElementCount VF);
1858 
1859   /// Collect the instructions that are uniform after vectorization. An
1860   /// instruction is uniform if we represent it with a single scalar value in
1861   /// the vectorized loop corresponding to each vector iteration. Examples of
1862   /// uniform instructions include pointer operands of consecutive or
1863   /// interleaved memory accesses. Note that although uniformity implies an
1864   /// instruction will be scalar, the reverse is not true. In general, a
1865   /// scalarized instruction will be represented by VF scalar values in the
1866   /// vectorized loop, each corresponding to an iteration of the original
1867   /// scalar loop.
1868   void collectLoopUniforms(ElementCount VF);
1869 
1870   /// Collect the instructions that are scalar after vectorization. An
1871   /// instruction is scalar if it is known to be uniform or will be scalarized
1872   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1873   /// to the list if they are used by a load/store instruction that is marked as
1874   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1875   /// VF values in the vectorized loop, each corresponding to an iteration of
1876   /// the original scalar loop.
1877   void collectLoopScalars(ElementCount VF);
1878 
1879   /// Keeps cost model vectorization decision and cost for instructions.
1880   /// Right now it is used for memory instructions only.
1881   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1882                                 std::pair<InstWidening, InstructionCost>>;
1883 
1884   DecisionList WideningDecisions;
1885 
1886   /// Returns true if \p V is expected to be vectorized and it needs to be
1887   /// extracted.
1888   bool needsExtract(Value *V, ElementCount VF) const {
1889     Instruction *I = dyn_cast<Instruction>(V);
1890     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1891         TheLoop->isLoopInvariant(I))
1892       return false;
1893 
1894     // Assume we can vectorize V (and hence we need extraction) if the
1895     // scalars are not computed yet. This can happen, because it is called
1896     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1897     // the scalars are collected. That should be a safe assumption in most
1898     // cases, because we check if the operands have vectorizable types
1899     // beforehand in LoopVectorizationLegality.
1900     return Scalars.find(VF) == Scalars.end() ||
1901            !isScalarAfterVectorization(I, VF);
1902   };
1903 
1904   /// Returns a range containing only operands needing to be extracted.
1905   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1906                                                    ElementCount VF) const {
1907     return SmallVector<Value *, 4>(make_filter_range(
1908         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1909   }
1910 
1911   /// Determines if we have the infrastructure to vectorize loop \p L and its
1912   /// epilogue, assuming the main loop is vectorized by \p VF.
1913   bool isCandidateForEpilogueVectorization(const Loop &L,
1914                                            const ElementCount VF) const;
1915 
1916   /// Returns true if epilogue vectorization is considered profitable, and
1917   /// false otherwise.
1918   /// \p VF is the vectorization factor chosen for the original loop.
1919   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1920 
1921 public:
1922   /// The loop that we evaluate.
1923   Loop *TheLoop;
1924 
1925   /// Predicated scalar evolution analysis.
1926   PredicatedScalarEvolution &PSE;
1927 
1928   /// Loop Info analysis.
1929   LoopInfo *LI;
1930 
1931   /// Vectorization legality.
1932   LoopVectorizationLegality *Legal;
1933 
1934   /// Vector target information.
1935   const TargetTransformInfo &TTI;
1936 
1937   /// Target Library Info.
1938   const TargetLibraryInfo *TLI;
1939 
1940   /// Demanded bits analysis.
1941   DemandedBits *DB;
1942 
1943   /// Assumption cache.
1944   AssumptionCache *AC;
1945 
1946   /// Interface to emit optimization remarks.
1947   OptimizationRemarkEmitter *ORE;
1948 
1949   const Function *TheFunction;
1950 
1951   /// Loop Vectorize Hint.
1952   const LoopVectorizeHints *Hints;
1953 
1954   /// The interleave access information contains groups of interleaved accesses
1955   /// with the same stride and close to each other.
1956   InterleavedAccessInfo &InterleaveInfo;
1957 
1958   /// Values to ignore in the cost model.
1959   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1960 
1961   /// Values to ignore in the cost model when VF > 1.
1962   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1963 
1964   /// All element types found in the loop.
1965   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1966 
1967   /// Profitable vector factors.
1968   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1969 };
1970 } // end namespace llvm
1971 
1972 /// Helper struct to manage generating runtime checks for vectorization.
1973 ///
/// The runtime checks are created up-front in temporary blocks, un-linked from
/// the existing IR, to allow a better estimate of their cost. After deciding to
/// vectorize, the checks are moved back. If deciding not to vectorize, the
/// temporary blocks are completely removed.
1978 class GeneratedRTChecks {
1979   /// Basic block which contains the generated SCEV checks, if any.
1980   BasicBlock *SCEVCheckBlock = nullptr;
1981 
1982   /// The value representing the result of the generated SCEV checks. If it is
1983   /// nullptr, either no SCEV checks have been generated or they have been used.
1984   Value *SCEVCheckCond = nullptr;
1985 
1986   /// Basic block which contains the generated memory runtime checks, if any.
1987   BasicBlock *MemCheckBlock = nullptr;
1988 
1989   /// The value representing the result of the generated memory runtime checks.
1990   /// If it is nullptr, either no memory runtime checks have been generated or
1991   /// they have been used.
1992   Value *MemRuntimeCheckCond = nullptr;
1993 
1994   DominatorTree *DT;
1995   LoopInfo *LI;
1996 
1997   SCEVExpander SCEVExp;
1998   SCEVExpander MemCheckExp;
1999 
2000 public:
2001   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
2002                     const DataLayout &DL)
2003       : DT(DT), LI(LI), SCEVExp(SE, DL, "scev.check"),
2004         MemCheckExp(SE, DL, "scev.check") {}
2005 
2006   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
2007   /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
2009   /// there is no vector code generation, the check blocks are removed
2010   /// completely.
2011   void Create(Loop *L, const LoopAccessInfo &LAI,
2012               const SCEVUnionPredicate &UnionPred) {
2013 
2014     BasicBlock *LoopHeader = L->getHeader();
2015     BasicBlock *Preheader = L->getLoopPreheader();
2016 
2017     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
2018     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
2019     // may be used by SCEVExpander. The blocks will be un-linked from their
2020     // predecessors and removed from LI & DT at the end of the function.
2021     if (!UnionPred.isAlwaysTrue()) {
2022       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
2023                                   nullptr, "vector.scevcheck");
2024 
2025       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
2026           &UnionPred, SCEVCheckBlock->getTerminator());
2027     }
2028 
2029     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
2030     if (RtPtrChecking.Need) {
2031       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
2032       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
2033                                  "vector.memcheck");
2034 
2035       MemRuntimeCheckCond =
2036           addRuntimeChecks(MemCheckBlock->getTerminator(), L,
2037                            RtPtrChecking.getChecks(), MemCheckExp);
2038       assert(MemRuntimeCheckCond &&
2039              "no RT checks generated although RtPtrChecking "
2040              "claimed checks are required");
2041     }
2042 
2043     if (!MemCheckBlock && !SCEVCheckBlock)
2044       return;
2045 
    // Unhook the temporary blocks containing the checks and update various
    // places accordingly.
2048     if (SCEVCheckBlock)
2049       SCEVCheckBlock->replaceAllUsesWith(Preheader);
2050     if (MemCheckBlock)
2051       MemCheckBlock->replaceAllUsesWith(Preheader);
2052 
2053     if (SCEVCheckBlock) {
2054       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2055       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
2056       Preheader->getTerminator()->eraseFromParent();
2057     }
2058     if (MemCheckBlock) {
2059       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
2060       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
2061       Preheader->getTerminator()->eraseFromParent();
2062     }
2063 
2064     DT->changeImmediateDominator(LoopHeader, Preheader);
2065     if (MemCheckBlock) {
2066       DT->eraseNode(MemCheckBlock);
2067       LI->removeBlock(MemCheckBlock);
2068     }
2069     if (SCEVCheckBlock) {
2070       DT->eraseNode(SCEVCheckBlock);
2071       LI->removeBlock(SCEVCheckBlock);
2072     }
2073   }
2074 
2075   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2076   /// unused.
2077   ~GeneratedRTChecks() {
2078     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2079     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2080     if (!SCEVCheckCond)
2081       SCEVCleaner.markResultUsed();
2082 
2083     if (!MemRuntimeCheckCond)
2084       MemCheckCleaner.markResultUsed();
2085 
2086     if (MemRuntimeCheckCond) {
2087       auto &SE = *MemCheckExp.getSE();
2088       // Memory runtime check generation creates compares that use expanded
2089       // values. Remove them before running the SCEVExpanderCleaners.
2090       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2091         if (MemCheckExp.isInsertedInstruction(&I))
2092           continue;
2093         SE.forgetValue(&I);
2094         I.eraseFromParent();
2095       }
2096     }
2097     MemCheckCleaner.cleanup();
2098     SCEVCleaner.cleanup();
2099 
2100     if (SCEVCheckCond)
2101       SCEVCheckBlock->eraseFromParent();
2102     if (MemRuntimeCheckCond)
2103       MemCheckBlock->eraseFromParent();
2104   }
2105 
2106   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2107   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2108   /// depending on the generated condition.
2109   BasicBlock *emitSCEVChecks(Loop *L, BasicBlock *Bypass,
2110                              BasicBlock *LoopVectorPreHeader,
2111                              BasicBlock *LoopExitBlock) {
2112     if (!SCEVCheckCond)
2113       return nullptr;
2114     if (auto *C = dyn_cast<ConstantInt>(SCEVCheckCond))
2115       if (C->isZero())
2116         return nullptr;
2117 
2118     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2119 
2120     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2121     // Create new preheader for vector loop.
2122     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2123       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2124 
2125     SCEVCheckBlock->getTerminator()->eraseFromParent();
2126     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2127     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2128                                                 SCEVCheckBlock);
2129 
2130     DT->addNewBlock(SCEVCheckBlock, Pred);
2131     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2132 
2133     ReplaceInstWithInst(
2134         SCEVCheckBlock->getTerminator(),
2135         BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheckCond));
2136     // Mark the check as used, to prevent it from being removed during cleanup.
2137     SCEVCheckCond = nullptr;
2138     return SCEVCheckBlock;
2139   }
2140 
2141   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2142   /// the branches to branch to the vector preheader or \p Bypass, depending on
2143   /// the generated condition.
2144   BasicBlock *emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass,
2145                                    BasicBlock *LoopVectorPreHeader) {
2146     // Check if we generated code that checks in runtime if arrays overlap.
2147     if (!MemRuntimeCheckCond)
2148       return nullptr;
2149 
2150     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2151     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2152                                                 MemCheckBlock);
2153 
2154     DT->addNewBlock(MemCheckBlock, Pred);
2155     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2156     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2157 
2158     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2159       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2160 
2161     ReplaceInstWithInst(
2162         MemCheckBlock->getTerminator(),
2163         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2164     MemCheckBlock->getTerminator()->setDebugLoc(
2165         Pred->getTerminator()->getDebugLoc());
2166 
2167     // Mark the check as used, to prevent it from being removed during cleanup.
2168     MemRuntimeCheckCond = nullptr;
2169     return MemCheckBlock;
2170   }
2171 };
2172 
2173 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2174 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2176 // vector length information is not provided, vectorization is not considered
2177 // explicit. Interleave hints are not allowed either. These limitations will be
2178 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
2180 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2181 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2182 // provides *explicit vectorization hints* (LV can bypass legal checks and
2183 // assume that vectorization is legal). However, both hints are implemented
2184 // using the same metadata (llvm.loop.vectorize, processed by
2185 // LoopVectorizeHints). This will be fixed in the future when the native IR
2186 // representation for pragma 'omp simd' is introduced.
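//
// For illustration, an outer loop annotated as
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];
// carries an explicit vector length and is accepted by this check.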
2187 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2188                                    OptimizationRemarkEmitter *ORE) {
2189   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2190   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2191 
2192   // Only outer loops with an explicit vectorization hint are supported.
2193   // Unannotated outer loops are ignored.
2194   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2195     return false;
2196 
2197   Function *Fn = OuterLp->getHeader()->getParent();
2198   if (!Hints.allowVectorization(Fn, OuterLp,
2199                                 true /*VectorizeOnlyWhenForced*/)) {
2200     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2201     return false;
2202   }
2203 
2204   if (Hints.getInterleave() > 1) {
2205     // TODO: Interleave support is future work.
2206     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2207                          "outer loops.\n");
2208     Hints.emitRemarkWithHints();
2209     return false;
2210   }
2211 
2212   return true;
2213 }
2214 
2215 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2216                                   OptimizationRemarkEmitter *ORE,
2217                                   SmallVectorImpl<Loop *> &V) {
2218   // Collect inner loops and outer loops without irreducible control flow. For
2219   // now, only collect outer loops that have explicit vectorization hints. If we
2220   // are stress testing the VPlan H-CFG construction, we collect the outermost
2221   // loop of every loop nest.
2222   if (L.isInnermost() || VPlanBuildStressTest ||
2223       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2224     LoopBlocksRPO RPOT(&L);
2225     RPOT.perform(LI);
2226     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2227       V.push_back(&L);
2228       // TODO: Collect inner loops inside marked outer loops in case
2229       // vectorization fails for the outer loop. Do not invoke
2230       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2231       // already known to be reducible. We can use an inherited attribute for
2232       // that.
2233       return;
2234     }
2235   }
2236   for (Loop *InnerL : L)
2237     collectSupportedLoops(*InnerL, LI, ORE, V);
2238 }
2239 
2240 namespace {
2241 
2242 /// The LoopVectorize Pass.
2243 struct LoopVectorize : public FunctionPass {
2244   /// Pass identification, replacement for typeid
2245   static char ID;
2246 
2247   LoopVectorizePass Impl;
2248 
2249   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2250                          bool VectorizeOnlyWhenForced = false)
2251       : FunctionPass(ID),
2252         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2253     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2254   }
2255 
2256   bool runOnFunction(Function &F) override {
2257     if (skipFunction(F))
2258       return false;
2259 
2260     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2261     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2262     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2263     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2264     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2265     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2266     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2267     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2268     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2269     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2270     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2271     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2272     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2273 
2274     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2275         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2276 
2277     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2278                         GetLAA, *ORE, PSI).MadeAnyChange;
2279   }
2280 
2281   void getAnalysisUsage(AnalysisUsage &AU) const override {
2282     AU.addRequired<AssumptionCacheTracker>();
2283     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2284     AU.addRequired<DominatorTreeWrapperPass>();
2285     AU.addRequired<LoopInfoWrapperPass>();
2286     AU.addRequired<ScalarEvolutionWrapperPass>();
2287     AU.addRequired<TargetTransformInfoWrapperPass>();
2288     AU.addRequired<AAResultsWrapperPass>();
2289     AU.addRequired<LoopAccessLegacyAnalysis>();
2290     AU.addRequired<DemandedBitsWrapperPass>();
2291     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2292     AU.addRequired<InjectTLIMappingsLegacy>();
2293 
2294     // We currently do not preserve loopinfo/dominator analyses with outer loop
2295     // vectorization. Until this is addressed, mark these analyses as preserved
2296     // only for non-VPlan-native path.
2297     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2298     if (!EnableVPlanNativePath) {
2299       AU.addPreserved<LoopInfoWrapperPass>();
2300       AU.addPreserved<DominatorTreeWrapperPass>();
2301     }
2302 
2303     AU.addPreserved<BasicAAWrapperPass>();
2304     AU.addPreserved<GlobalsAAWrapperPass>();
2305     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2306   }
2307 };
2308 
2309 } // end anonymous namespace
2310 
2311 //===----------------------------------------------------------------------===//
2312 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2313 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2314 //===----------------------------------------------------------------------===//
2315 
2316 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
  // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // inside the vector loop body.
2320   Instruction *Instr = dyn_cast<Instruction>(V);
2321   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2322                      (!Instr ||
2323                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2324   // Place the code for broadcasting invariant variables in the new preheader.
2325   IRBuilder<>::InsertPointGuard Guard(Builder);
2326   if (SafeToHoist)
2327     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2328 
2329   // Broadcast the scalar into all locations in the vector.
2330   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2331 
2332   return Shuf;
2333 }
2334 
/// This function adds
/// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// to each vector element of Val. The sequence starts at StartIdx.
/// \p BinOp is relevant only for FP induction variables.
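/// For illustration: with VF = 4, StartIdx = 0 and Step = 2 for an i32
/// induction, the vector <0*2, 1*2, 2*2, 3*2> = <0, 2, 4, 6> is added
/// element-wise to Val.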
2339 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2340                             Instruction::BinaryOps BinOp, ElementCount VF,
2341                             IRBuilderBase &Builder) {
2342   assert(VF.isVector() && "only vector VFs are supported");
2343 
2344   // Create and check the types.
2345   auto *ValVTy = cast<VectorType>(Val->getType());
2346   ElementCount VLen = ValVTy->getElementCount();
2347 
2348   Type *STy = Val->getType()->getScalarType();
2349   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2350          "Induction Step must be an integer or FP");
2351   assert(Step->getType() == STy && "Step has wrong type");
2352 
2353   SmallVector<Constant *, 8> Indices;
2354 
  // Create a vector of consecutive numbers from zero to VF-1.
2356   VectorType *InitVecValVTy = ValVTy;
2357   Type *InitVecValSTy = STy;
2358   if (STy->isFloatingPointTy()) {
2359     InitVecValSTy =
2360         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2361     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2362   }
2363   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2364 
2365   // Splat the StartIdx
2366   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2367 
2368   if (STy->isIntegerTy()) {
2369     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2370     Step = Builder.CreateVectorSplat(VLen, Step);
2371     assert(Step->getType() == Val->getType() && "Invalid step vec");
2372     // FIXME: The newly created binary instructions should contain nsw/nuw
2373     // flags, which can be found from the original scalar operations.
2374     Step = Builder.CreateMul(InitVec, Step);
2375     return Builder.CreateAdd(Val, Step, "induction");
2376   }
2377 
2378   // Floating point induction.
2379   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2380          "Binary Opcode should be specified for FP induction");
2381   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2382   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2383 
2384   Step = Builder.CreateVectorSplat(VLen, Step);
2385   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2386   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2387 }
2388 
2389 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
2390     const InductionDescriptor &II, Value *Step, Value *Start,
2391     Instruction *EntryVal, VPValue *Def, VPTransformState &State) {
2392   IRBuilderBase &Builder = State.Builder;
2393   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
2394          "Expected either an induction phi-node or a truncate of it!");
2395 
2396   // Construct the initial value of the vector IV in the vector loop preheader
2397   auto CurrIP = Builder.saveIP();
2398   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2399   if (isa<TruncInst>(EntryVal)) {
2400     assert(Start->getType()->isIntegerTy() &&
2401            "Truncation requires an integer type");
2402     auto *TruncType = cast<IntegerType>(EntryVal->getType());
2403     Step = Builder.CreateTrunc(Step, TruncType);
2404     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
2405   }
2406 
2407   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
2408   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
2409   Value *SteppedStart = getStepVector(
2410       SplatStart, Zero, Step, II.getInductionOpcode(), State.VF, State.Builder);
2411 
2412   // We create vector phi nodes for both integer and floating-point induction
2413   // variables. Here, we determine the kind of arithmetic we will perform.
2414   Instruction::BinaryOps AddOp;
2415   Instruction::BinaryOps MulOp;
2416   if (Step->getType()->isIntegerTy()) {
2417     AddOp = Instruction::Add;
2418     MulOp = Instruction::Mul;
2419   } else {
2420     AddOp = II.getInductionOpcode();
2421     MulOp = Instruction::FMul;
2422   }
2423 
2424   // Multiply the vectorization factor by the step using integer or
2425   // floating-point arithmetic as appropriate.
2426   Type *StepType = Step->getType();
2427   Value *RuntimeVF;
2428   if (Step->getType()->isFloatingPointTy())
2429     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
2430   else
2431     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
2432   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
2433 
2434   // Create a vector splat to use in the induction update.
2435   //
2436   // FIXME: If the step is non-constant, we create the vector splat with
2437   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
2438   //        handle a constant vector splat.
2439   Value *SplatVF = isa<Constant>(Mul)
2440                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
2441                        : Builder.CreateVectorSplat(State.VF, Mul);
2442   Builder.restoreIP(CurrIP);
2443 
2444   // We may need to add the step a number of times, depending on the unroll
2445   // factor. The last of those goes into the PHI.
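  // Illustrative example (assuming a fixed VF of 4, UF = 2, Start = 0 and
  // Step = 1): part 0 uses %vec.ind = <0, 1, 2, 3>, part 1 uses
  // %step.add = <4, 5, 6, 7>, and %vec.ind.next = <8, 9, 10, 11> feeds the
  // backedge of the PHI.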
2446   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
2447                                     &*LoopVectorBody->getFirstInsertionPt());
2448   VecInd->setDebugLoc(EntryVal->getDebugLoc());
2449   Instruction *LastInduction = VecInd;
2450   for (unsigned Part = 0; Part < UF; ++Part) {
2451     State.set(Def, LastInduction, Part);
2452 
2453     if (isa<TruncInst>(EntryVal))
2454       addMetadata(LastInduction, EntryVal);
2455 
2456     LastInduction = cast<Instruction>(
2457         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
2458     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
2459   }
2460 
2461   // Move the last step to the end of the latch block. This ensures consistent
2462   // placement of all induction updates.
2463   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
2464   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
2465   LastInduction->moveBefore(Br);
2466   LastInduction->setName("vec.ind.next");
2467 
2468   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
2469   VecInd->addIncoming(LastInduction, LoopVectorLatch);
2470 }
2471 
2472 void InnerLoopVectorizer::widenIntOrFpInduction(
2473     PHINode *IV, VPWidenIntOrFpInductionRecipe *Def, VPTransformState &State,
2474     Value *CanonicalIV) {
2475   Value *Start = Def->getStartValue()->getLiveInIRValue();
2476   const InductionDescriptor &ID = Def->getInductionDescriptor();
2477   TruncInst *Trunc = Def->getTruncInst();
2478   IRBuilderBase &Builder = State.Builder;
2479   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
2480   assert(!State.VF.isZero() && "VF must be non-zero");
2481 
2482   // The value from the original loop to which we are mapping the new induction
2483   // variable.
2484   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
2485 
2486   auto &DL = EntryVal->getModule()->getDataLayout();
2487 
2488   // Generate code for the induction step. Note that induction steps are
2489   // required to be loop-invariant
2490   auto CreateStepValue = [&](const SCEV *Step) -> Value * {
2491     assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
2492            "Induction step should be loop invariant");
2493     if (PSE.getSE()->isSCEVable(IV->getType())) {
2494       SCEVExpander Exp(*PSE.getSE(), DL, "induction");
2495       return Exp.expandCodeFor(Step, Step->getType(),
2496                                State.CFG.VectorPreHeader->getTerminator());
2497     }
2498     return cast<SCEVUnknown>(Step)->getValue();
2499   };
2500 
2501   // The scalar value to broadcast. This is derived from the canonical
2502   // induction variable. If a truncation type is given, truncate the canonical
2503   // induction variable and step. Otherwise, derive these values from the
2504   // induction descriptor.
2505   auto CreateScalarIV = [&](Value *&Step) -> Value * {
2506     Value *ScalarIV = CanonicalIV;
2507     Type *NeededType = IV->getType();
2508     if (!Def->isCanonical() || ScalarIV->getType() != NeededType) {
2509       ScalarIV =
2510           NeededType->isIntegerTy()
2511               ? Builder.CreateSExtOrTrunc(ScalarIV, NeededType)
2512               : Builder.CreateCast(Instruction::SIToFP, ScalarIV, NeededType);
2513       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID,
2514                                       State.CFG.PrevBB);
2515       ScalarIV->setName("offset.idx");
2516     }
2517     if (Trunc) {
2518       auto *TruncType = cast<IntegerType>(Trunc->getType());
2519       assert(Step->getType()->isIntegerTy() &&
2520              "Truncation requires an integer step");
2521       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
2522       Step = Builder.CreateTrunc(Step, TruncType);
2523     }
2524     return ScalarIV;
2525   };
2526 
2527   // Fast-math-flags propagate from the original induction instruction.
2528   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
2529   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
2530     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
2531 
2532   // Now do the actual transformations, and start with creating the step value.
2533   Value *Step = CreateStepValue(ID.getStep());
2534   if (State.VF.isScalar()) {
2535     Value *ScalarIV = CreateScalarIV(Step);
2536     Type *ScalarTy = IntegerType::get(ScalarIV->getContext(),
2537                                       Step->getType()->getScalarSizeInBits());
2538 
2539     Instruction::BinaryOps IncOp = ID.getInductionOpcode();
2540     if (IncOp == Instruction::BinaryOpsEnd)
2541       IncOp = Instruction::Add;
2542     for (unsigned Part = 0; Part < UF; ++Part) {
2543       Value *StartIdx = ConstantInt::get(ScalarTy, Part);
2544       Instruction::BinaryOps MulOp = Instruction::Mul;
2545       if (Step->getType()->isFloatingPointTy()) {
2546         StartIdx = Builder.CreateUIToFP(StartIdx, Step->getType());
2547         MulOp = Instruction::FMul;
2548       }
2549 
2550       Value *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2551       Value *EntryPart = Builder.CreateBinOp(IncOp, ScalarIV, Mul, "induction");
2552       State.set(Def, EntryPart, Part);
2553       if (Trunc) {
2554         assert(!Step->getType()->isFloatingPointTy() &&
2555                "fp inductions shouldn't be truncated");
2556         addMetadata(EntryPart, Trunc);
2557       }
2558     }
2559     return;
2560   }
2561 
2562   // Create a new independent vector induction variable, if one is needed.
2563   if (Def->needsVectorIV())
2564     createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal, Def, State);
2565 
2566   if (Def->needsScalarIV()) {
2567     // Create scalar steps that can be used by instructions we will later
2568     // scalarize. Note that the addition of the scalar steps will not increase
2569     // the number of instructions in the loop in the common case prior to
2570     // InstCombine. We will be trading one vector extract for each scalar step.
2571     Value *ScalarIV = CreateScalarIV(Step);
2572     buildScalarSteps(ScalarIV, Step, EntryVal, ID, Def, State);
2573   }
2574 }
2575 
2576 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
2577                                            Instruction *EntryVal,
2578                                            const InductionDescriptor &ID,
2579                                            VPValue *Def,
2580                                            VPTransformState &State) {
2581   IRBuilderBase &Builder = State.Builder;
2582   // We shouldn't have to build scalar steps if we aren't vectorizing.
2583   assert(State.VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2585   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2586   assert(ScalarIVTy == Step->getType() &&
2587          "Val and Step should have the same type");
2588 
2589   // We build scalar steps for both integer and floating-point induction
2590   // variables. Here, we determine the kind of arithmetic we will perform.
2591   Instruction::BinaryOps AddOp;
2592   Instruction::BinaryOps MulOp;
2593   if (ScalarIVTy->isIntegerTy()) {
2594     AddOp = Instruction::Add;
2595     MulOp = Instruction::Mul;
2596   } else {
2597     AddOp = ID.getInductionOpcode();
2598     MulOp = Instruction::FMul;
2599   }
2600 
2601   // Determine the number of scalars we need to generate for each unroll
2602   // iteration.
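  // Illustrative example (assuming a fixed VF of 4, UF = 2, Step = 1 and a
  // scalar IV %i): part 0 produces %i + 0 .. %i + 3 and part 1 produces
  // %i + 4 .. %i + 7; if only the first lane is used, only %i + 0 and %i + 4
  // are emitted.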
2603   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2604   unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2605   // Compute the scalar steps and save the results in State.
2606   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2607                                      ScalarIVTy->getScalarSizeInBits());
2608   Type *VecIVTy = nullptr;
2609   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2610   if (!FirstLaneOnly && State.VF.isScalable()) {
2611     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2612     UnitStepVec =
2613         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2614     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2615     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2616   }
2617 
2618   for (unsigned Part = 0; Part < State.UF; ++Part) {
2619     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2620 
2621     if (!FirstLaneOnly && State.VF.isScalable()) {
2622       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2623       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2624       if (ScalarIVTy->isFloatingPointTy())
2625         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2626       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2627       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2628       State.set(Def, Add, Part);
      // It's also useful to record the lane values for the known minimum
      // number of elements, so we do that below as well. This improves code
      // quality when extracting the first element, for example.
2632     }
2633 
2634     if (ScalarIVTy->isFloatingPointTy())
2635       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2636 
2637     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2638       Value *StartIdx = Builder.CreateBinOp(
2639           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2640       // The step returned by `createStepForVF` is a runtime-evaluated value
2641       // when VF is scalable. Otherwise, it should be folded into a Constant.
2642       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2643              "Expected StartIdx to be folded to a constant when VF is not "
2644              "scalable");
2645       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2646       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2647       State.set(Def, Add, VPIteration(Part, Lane));
2648     }
2649   }
2650 }
2651 
2652 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2653                                                     const VPIteration &Instance,
2654                                                     VPTransformState &State) {
2655   Value *ScalarInst = State.get(Def, Instance);
2656   Value *VectorValue = State.get(Def, Instance.Part);
2657   VectorValue = Builder.CreateInsertElement(
2658       VectorValue, ScalarInst,
2659       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2660   State.set(Def, VectorValue, Instance.Part);
2661 }
2662 
2663 // Return whether we allow using masked interleave-groups (for dealing with
2664 // strided loads/stores that reside in predicated blocks, or for dealing
2665 // with gaps).
2666 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2667   // If an override option has been passed in for interleaved accesses, use it.
2668   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2669     return EnableMaskedInterleavedMemAccesses;
2670 
2671   return TTI.enableMaskedInterleavedAccessVectorization();
2672 }
2673 
2674 // Try to vectorize the interleave group that \p Instr belongs to.
2675 //
2676 // E.g. Translate following interleaved load group (factor = 3):
2677 //   for (i = 0; i < N; i+=3) {
2678 //     R = Pic[i];             // Member of index 0
2679 //     G = Pic[i+1];           // Member of index 1
2680 //     B = Pic[i+2];           // Member of index 2
2681 //     ... // do something to R, G, B
2682 //   }
2683 // To:
2684 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2685 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2686 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2687 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2688 //
2689 // Or translate following interleaved store group (factor = 3):
2690 //   for (i = 0; i < N; i+=3) {
2691 //     ... do something to R, G, B
2692 //     Pic[i]   = R;           // Member of index 0
2693 //     Pic[i+1] = G;           // Member of index 1
2694 //     Pic[i+2] = B;           // Member of index 2
2695 //   }
2696 // To:
2697 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2698 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2699 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2700 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2701 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2702 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2703     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2704     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2705     VPValue *BlockInMask) {
2706   Instruction *Instr = Group->getInsertPos();
2707   const DataLayout &DL = Instr->getModule()->getDataLayout();
2708 
2709   // Prepare for the vector type of the interleaved load/store.
2710   Type *ScalarTy = getLoadStoreType(Instr);
2711   unsigned InterleaveFactor = Group->getFactor();
2712   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2713   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2714 
2715   // Prepare for the new pointers.
2716   SmallVector<Value *, 2> AddrParts;
2717   unsigned Index = Group->getIndex(Instr);
2718 
2719   // TODO: extend the masked interleaved-group support to reversed access.
2720   assert((!BlockInMask || !Group->isReverse()) &&
2721          "Reversed masked interleave-group not supported.");
2722 
2723   // If the group is reverse, adjust the index to refer to the last vector lane
2724   // instead of the first. We adjust the index from the first vector lane,
2725   // rather than directly getting the pointer for lane VF - 1, because the
2726   // pointer operand of the interleaved access is supposed to be uniform. For
2727   // uniform instructions, we're only required to generate a value for the
2728   // first vector lane in each unroll iteration.
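  // Illustrative example (assuming a fixed VF of 4 and an interleave factor
  // of 2, with the insert position at member index 0): Index becomes
  // 0 + (4 - 1) * 2 = 6, so the address below is moved back 6 elements to the
  // lowest address of the 4 consecutive tuples accessed by this vector
  // iteration.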
2729   if (Group->isReverse())
2730     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2731 
2732   for (unsigned Part = 0; Part < UF; Part++) {
2733     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2734     setDebugLocFromInst(AddrPart);
2735 
    // Note that the current instruction could be at any member index; we need
    // to adjust the address to the member at index 0.
2738     //
2739     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2740     //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
2742     //
2743     // E.g.  A[i+1] = a;     // Member of index 1
2744     //       A[i]   = b;     // Member of index 0
2745     //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2747 
2748     bool InBounds = false;
2749     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2750       InBounds = gep->isInBounds();
2751     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2752     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2753 
2754     // Cast to the vector pointer type.
2755     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2756     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2757     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2758   }
2759 
2760   setDebugLocFromInst(Instr);
2761   Value *PoisonVec = PoisonValue::get(VecTy);
2762 
2763   Value *MaskForGaps = nullptr;
2764   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2765     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2766     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2767   }
2768 
2769   // Vectorize the interleaved load group.
2770   if (isa<LoadInst>(Instr)) {
2771     // For each unroll part, create a wide load for the group.
2772     SmallVector<Value *, 2> NewLoads;
2773     for (unsigned Part = 0; Part < UF; Part++) {
2774       Instruction *NewLoad;
2775       if (BlockInMask || MaskForGaps) {
2776         assert(useMaskedInterleavedAccesses(*TTI) &&
2777                "masked interleaved groups are not allowed.");
2778         Value *GroupMask = MaskForGaps;
2779         if (BlockInMask) {
2780           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2781           Value *ShuffledMask = Builder.CreateShuffleVector(
2782               BlockInMaskPart,
2783               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2784               "interleaved.mask");
2785           GroupMask = MaskForGaps
2786                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2787                                                 MaskForGaps)
2788                           : ShuffledMask;
2789         }
2790         NewLoad =
2791             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2792                                      GroupMask, PoisonVec, "wide.masked.vec");
      } else
2795         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2796                                             Group->getAlign(), "wide.vec");
2797       Group->addMetadata(NewLoad);
2798       NewLoads.push_back(NewLoad);
2799     }
2800 
2801     // For each member in the group, shuffle out the appropriate data from the
2802     // wide loads.
2803     unsigned J = 0;
2804     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2805       Instruction *Member = Group->getMember(I);
2806 
2807       // Skip the gaps in the group.
2808       if (!Member)
2809         continue;
2810 
2811       auto StrideMask =
2812           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2813       for (unsigned Part = 0; Part < UF; Part++) {
2814         Value *StridedVec = Builder.CreateShuffleVector(
2815             NewLoads[Part], StrideMask, "strided.vec");
2816 
        // If this member has a different type, cast the result to the
        // member's type.
2818         if (Member->getType() != ScalarTy) {
2819           assert(!VF.isScalable() && "VF is assumed to be non scalable.");
2820           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2821           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2822         }
2823 
2824         if (Group->isReverse())
2825           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2826 
2827         State.set(VPDefs[J], StridedVec, Part);
2828       }
2829       ++J;
2830     }
2831     return;
2832   }
2833 
  // The subvector type for the current instruction.
2835   auto *SubVT = VectorType::get(ScalarTy, VF);
2836 
2837   // Vectorize the interleaved store group.
2838   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2839   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2840          "masked interleaved groups are not allowed.");
2841   assert((!MaskForGaps || !VF.isScalable()) &&
2842          "masking gaps for scalable vectors is not yet supported.");
2843   for (unsigned Part = 0; Part < UF; Part++) {
2844     // Collect the stored vector from each member.
2845     SmallVector<Value *, 4> StoredVecs;
2846     for (unsigned i = 0; i < InterleaveFactor; i++) {
2847       assert((Group->getMember(i) || MaskForGaps) &&
2848              "Fail to get a member from an interleaved store group");
2849       Instruction *Member = Group->getMember(i);
2850 
2851       // Skip the gaps in the group.
2852       if (!Member) {
2853         Value *Undef = PoisonValue::get(SubVT);
2854         StoredVecs.push_back(Undef);
2855         continue;
2856       }
2857 
2858       Value *StoredVec = State.get(StoredValues[i], Part);
2859 
2860       if (Group->isReverse())
2861         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2862 
      // If this member has a different type, cast it to the unified type.
2865       if (StoredVec->getType() != SubVT)
2866         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2867 
2868       StoredVecs.push_back(StoredVec);
2869     }
2870 
2871     // Concatenate all vectors into a wide vector.
2872     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2873 
2874     // Interleave the elements in the wide vector.
2875     Value *IVec = Builder.CreateShuffleVector(
2876         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2877         "interleaved.vec");
2878 
2879     Instruction *NewStoreInstr;
2880     if (BlockInMask || MaskForGaps) {
2881       Value *GroupMask = MaskForGaps;
2882       if (BlockInMask) {
2883         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2884         Value *ShuffledMask = Builder.CreateShuffleVector(
2885             BlockInMaskPart,
2886             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2887             "interleaved.mask");
2888         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2889                                                       ShuffledMask, MaskForGaps)
2890                                 : ShuffledMask;
2891       }
2892       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2893                                                 Group->getAlign(), GroupMask);
2894     } else
2895       NewStoreInstr =
2896           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2897 
2898     Group->addMetadata(NewStoreInstr);
2899   }
2900 }
2901 
2902 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2903                                                VPReplicateRecipe *RepRecipe,
2904                                                const VPIteration &Instance,
2905                                                bool IfPredicateInstr,
2906                                                VPTransformState &State) {
2907   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2908 
  // llvm.experimental.noalias.scope.decl intrinsics must not be duplicated;
  // only emit them for the first lane and part.
2911   if (isa<NoAliasScopeDeclInst>(Instr))
2912     if (!Instance.isFirstIteration())
2913       return;
2914 
2915   setDebugLocFromInst(Instr);
2916 
  // Does this instruction return a value?
2918   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2919 
2920   Instruction *Cloned = Instr->clone();
2921   if (!IsVoidRetTy)
2922     Cloned->setName(Instr->getName() + ".cloned");
2923 
  // If the scalarized instruction contributes to the address computation of a
  // widened masked load/store which was in a basic block that needed
  // predication and is not predicated after vectorization, we can't propagate
  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
  // instruction could feed a poison value to the base address of the widened
  // load/store.
2930   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2931     Cloned->dropPoisonGeneratingFlags();
2932 
2933   State.Builder.SetInsertPoint(Builder.GetInsertBlock(),
2934                                Builder.GetInsertPoint());
2935   // Replace the operands of the cloned instructions with their scalar
2936   // equivalents in the new loop.
2937   for (auto &I : enumerate(RepRecipe->operands())) {
2938     auto InputInstance = Instance;
2939     VPValue *Operand = I.value();
2940     VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
2941     if (OperandR && OperandR->isUniform())
2942       InputInstance.Lane = VPLane::getFirstLane();
2943     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2944   }
2945   addNewMetadata(Cloned, Instr);
2946 
2947   // Place the cloned scalar in the new loop.
2948   Builder.Insert(Cloned);
2949 
2950   State.set(RepRecipe, Cloned, Instance);
2951 
  // If we just cloned a new assumption, add it to the assumption cache.
2953   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2954     AC->registerAssumption(II);
2955 
2956   // End if-block.
2957   if (IfPredicateInstr)
2958     PredicatedInstructions.push_back(Cloned);
2959 }
2960 
2961 void InnerLoopVectorizer::createHeaderBranch(Loop *L) {
2962   BasicBlock *Header = L->getHeader();
2963   assert(!L->getLoopLatch() && "loop should not have a latch at this point");
2964 
2965   IRBuilder<> B(Header->getTerminator());
2966   Instruction *OldInst =
2967       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
2968   setDebugLocFromInst(OldInst, &B);
2969 
  // Connect the header to the exit block and back to itself, replacing the
  // old terminator.
2972   B.CreateCondBr(B.getTrue(), L->getUniqueExitBlock(), Header);
2973 
2974   // Now we have two terminators. Remove the old one from the block.
2975   Header->getTerminator()->eraseFromParent();
2976 }
2977 
2978 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2979   if (TripCount)
2980     return TripCount;
2981 
2982   assert(L && "Create Trip Count for null loop.");
2983   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2984   // Find the loop boundaries.
2985   ScalarEvolution *SE = PSE.getSE();
2986   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2987   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2988          "Invalid loop count");
2989 
2990   Type *IdxTy = Legal->getWidestInductionType();
2991   assert(IdxTy && "No type for induction");
2992 
  // The exit count might have type i64 while the phi has type i32. This can
  // happen if the induction variable is sign-extended before the compare. The
  // only way we can get a backedge-taken count in that case is if the
  // induction variable was signed and therefore does not overflow, so the
  // truncation is legal.
2998   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2999       IdxTy->getPrimitiveSizeInBits())
3000     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
3001   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
3002 
3003   // Get the total trip count from the count by adding 1.
3004   const SCEV *ExitCount = SE->getAddExpr(
3005       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
3006 
3007   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
3008 
3009   // Expand the trip count and place the new instructions in the preheader.
3010   // Notice that the pre-header does not change, only the loop body.
3011   SCEVExpander Exp(*SE, DL, "induction");
3012 
3013   // Count holds the overall loop count (N).
3014   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
3015                                 L->getLoopPreheader()->getTerminator());
3016 
3017   if (TripCount->getType()->isPointerTy())
3018     TripCount =
3019         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
3020                                     L->getLoopPreheader()->getTerminator());
3021 
3022   return TripCount;
3023 }
3024 
3025 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
3026   if (VectorTripCount)
3027     return VectorTripCount;
3028 
3029   Value *TC = getOrCreateTripCount(L);
3030   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
3031 
3032   Type *Ty = TC->getType();
3033   // This is where we can make the step a runtime constant.
3034   Value *Step = createStepForVF(Builder, Ty, VF, UF);
3035 
3036   // If the tail is to be folded by masking, round the number of iterations N
3037   // up to a multiple of Step instead of rounding down. This is done by first
3038   // adding Step-1 and then rounding down. Note that it's ok if this addition
3039   // overflows: the vector induction variable will eventually wrap to zero given
3040   // that it starts at zero and its Step is a power of two; the loop will then
3041   // exit, with the last early-exit vector comparison also producing all-true.
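  // Illustrative example (assuming a trip count N = 10 and VF * UF = 4): the
  // add below plus the remainder subtraction further down effectively round N
  // up to 12, so the masked vector loop covers all 10 iterations.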
3042   if (Cost->foldTailByMasking()) {
3043     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
3044            "VF*UF must be a power of 2 when folding tail by masking");
3045     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
3046     TC = Builder.CreateAdd(
3047         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
3048   }
3049 
3050   // Now we need to generate the expression for the part of the loop that the
3051   // vectorized body will execute. This is equal to N - (N % Step) if scalar
3052   // iterations are not required for correctness, or N - Step, otherwise. Step
3053   // is equal to the vectorization factor (number of SIMD elements) times the
3054   // unroll factor (number of SIMD instructions).
3055   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
3056 
3057   // There are cases where we *must* run at least one iteration in the remainder
3058   // loop.  See the cost model for when this can happen.  If the step evenly
3059   // divides the trip count, we set the remainder to be equal to the step. If
3060   // the step does not evenly divide the trip count, no adjustment is necessary
3061   // since there will already be scalar iterations. Note that the minimum
3062   // iterations check ensures that N >= Step.
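  // Illustrative example (assuming N = 8 and VF * UF = 4): the remainder R is
  // 0, so the select below bumps it to 4 and the vector trip count becomes 4,
  // leaving 4 iterations for the required scalar epilogue. With N = 10, R is
  // already 2 and no adjustment is needed.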
3063   if (Cost->requiresScalarEpilogue(VF)) {
3064     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
3065     R = Builder.CreateSelect(IsZero, Step, R);
3066   }
3067 
3068   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
3069 
3070   return VectorTripCount;
3071 }
3072 
3073 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
3074                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as
  // DstVTy.
3076   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
3077   unsigned VF = DstFVTy->getNumElements();
3078   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
3079   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
3080   Type *SrcElemTy = SrcVecTy->getElementType();
3081   Type *DstElemTy = DstFVTy->getElementType();
3082   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
3083          "Vector elements must have same size");
3084 
3085   // Do a direct cast if element types are castable.
3086   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
3087     return Builder.CreateBitOrPointerCast(V, DstFVTy);
3088   }
  // V cannot be cast directly to the desired vector type. This may happen
  // when V is a floating-point vector but DstVTy is a vector of pointers, or
  // vice-versa. Handle this with a two-step cast through an intermediate
  // integer type, i.e. Ptr <-> Int <-> Float.
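  // Illustrative example (assuming 32-bit pointers): casting <4 x float> to
  // <4 x i8*> is done as a bitcast from <4 x float> to <4 x i32>, followed by
  // an inttoptr from <4 x i32> to <4 x i8*>.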
3093   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
3094          "Only one type should be a pointer type");
3095   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
3096          "Only one type should be a floating point type");
3097   Type *IntTy =
3098       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
3099   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
3100   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
3101   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
3102 }
3103 
3104 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3105                                                          BasicBlock *Bypass) {
3106   Value *Count = getOrCreateTripCount(L);
3107   // Reuse existing vector loop preheader for TC checks.
  // Note that a new preheader block is generated for the vector loop.
3109   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
3110   IRBuilder<> Builder(TCCheckBlock->getTerminator());
3111 
3112   // Generate code to check if the loop's trip count is less than VF * UF, or
3113   // equal to it in case a scalar epilogue is required; this implies that the
3114   // vector trip count is zero. This check also covers the case where adding one
3115   // to the backedge-taken count overflowed leading to an incorrect trip count
3116   // of zero. In this case we will also jump to the scalar loop.
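  // Illustrative example (assuming a fixed VF of 4, UF = 2, no tail folding
  // and no required scalar epilogue): the check below is roughly
  //   %min.iters.check = icmp ult i64 %N, 8
  // and the branch created further down takes the bypass to the scalar loop
  // when the check is true.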
3117   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
3118                                             : ICmpInst::ICMP_ULT;
3119 
3120   // If tail is to be folded, vector loop takes care of all iterations.
3121   Value *CheckMinIters = Builder.getFalse();
3122   if (!Cost->foldTailByMasking()) {
3123     Value *Step = createStepForVF(Builder, Count->getType(), VF, UF);
3124     CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
3125   }
3126   // Create new preheader for vector loop.
3127   LoopVectorPreHeader =
3128       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
3129                  "vector.ph");
3130 
3131   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
3132                                DT->getNode(Bypass)->getIDom()) &&
3133          "TC check is expected to dominate Bypass");
3134 
3135   // Update dominator for Bypass & LoopExit (if needed).
3136   DT->changeImmediateDominator(Bypass, TCCheckBlock);
3137   if (!Cost->requiresScalarEpilogue(VF))
3138     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3140     // dominator of the exit blocks.
3141     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3142 
3143   ReplaceInstWithInst(
3144       TCCheckBlock->getTerminator(),
3145       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
3146   LoopBypassBlocks.push_back(TCCheckBlock);
3147 }
3148 
BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
  BasicBlock *const SCEVCheckBlock =
3152       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
3153   if (!SCEVCheckBlock)
3154     return nullptr;
3155 
3156   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3157            (OptForSizeBasedOnProfile &&
3158             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3159          "Cannot SCEV check stride or overflow when optimizing for size");
3160 
3161 
  // Update dominator only if this is the first RT check.
3163   if (LoopBypassBlocks.empty()) {
3164     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3165     if (!Cost->requiresScalarEpilogue(VF))
3166       // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
3168       // dominator of the exit blocks.
3169       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3170   }
3171 
3172   LoopBypassBlocks.push_back(SCEVCheckBlock);
3173   AddedSafetyChecks = true;
3174   return SCEVCheckBlock;
3175 }
3176 
3177 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L,
3178                                                       BasicBlock *Bypass) {
3179   // VPlan-native path does not do any analysis for runtime checks currently.
3180   if (EnableVPlanNativePath)
3181     return nullptr;
3182 
3183   BasicBlock *const MemCheckBlock =
3184       RTChecks.emitMemRuntimeChecks(L, Bypass, LoopVectorPreHeader);
3185 
  // Check if we generated code that checks at runtime whether arrays overlap.
  // We put the checks into a separate block to make the more common case of
  // few elements faster.
3189   if (!MemCheckBlock)
3190     return nullptr;
3191 
3192   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3193     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3194            "Cannot emit memory checks when optimizing for size, unless forced "
3195            "to vectorize.");
3196     ORE->emit([&]() {
3197       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3198                                         L->getStartLoc(), L->getHeader())
3199              << "Code-size may be reduced by not forcing "
3200                 "vectorization, or by source-code modifications "
3201                 "eliminating the need for runtime checks "
3202                 "(e.g., adding 'restrict').";
3203     });
3204   }
3205 
3206   LoopBypassBlocks.push_back(MemCheckBlock);
3207 
3208   AddedSafetyChecks = true;
3209 
3210   // We currently don't use LoopVersioning for the actual loop cloning but we
3211   // still use it to add the noalias metadata.
3212   LVer = std::make_unique<LoopVersioning>(
3213       *Legal->getLAI(),
3214       Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
3215       DT, PSE.getSE());
3216   LVer->prepareNoAliasMetadata();
3217   return MemCheckBlock;
3218 }
3219 
3220 Value *InnerLoopVectorizer::emitTransformedIndex(
3221     IRBuilderBase &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
3222     const InductionDescriptor &ID, BasicBlock *VectorHeader) const {
3223 
3224   SCEVExpander Exp(*SE, DL, "induction");
3225   auto Step = ID.getStep();
3226   auto StartValue = ID.getStartValue();
3227   assert(Index->getType()->getScalarType() == Step->getType() &&
3228          "Index scalar type does not match StepValue type");
3229 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle only some
  // trivial cases.
3236   auto CreateAdd = [&B](Value *X, Value *Y) {
3237     assert(X->getType() == Y->getType() && "Types don't match!");
3238     if (auto *CX = dyn_cast<ConstantInt>(X))
3239       if (CX->isZero())
3240         return Y;
3241     if (auto *CY = dyn_cast<ConstantInt>(Y))
3242       if (CY->isZero())
3243         return X;
3244     return B.CreateAdd(X, Y);
3245   };
3246 
3247   // We allow X to be a vector type, in which case Y will potentially be
3248   // splatted into a vector with the same element count.
3249   auto CreateMul = [&B](Value *X, Value *Y) {
3250     assert(X->getType()->getScalarType() == Y->getType() &&
3251            "Types don't match!");
3252     if (auto *CX = dyn_cast<ConstantInt>(X))
3253       if (CX->isOne())
3254         return Y;
3255     if (auto *CY = dyn_cast<ConstantInt>(Y))
3256       if (CY->isOne())
3257         return X;
3258     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3259     if (XVTy && !isa<VectorType>(Y->getType()))
3260       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
3261     return B.CreateMul(X, Y);
3262   };
3263 
3264   // Get a suitable insert point for SCEV expansion. For blocks in the vector
3265   // loop, choose the end of the vector loop header (=VectorHeader), because
3266   // the DomTree is not kept up-to-date for additional blocks generated in the
3267   // vector loop. By using the header as insertion point, we guarantee that the
3268   // expanded instructions dominate all their uses.
3269   auto GetInsertPoint = [this, &B, VectorHeader]() {
3270     BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
3271     if (InsertBB != LoopVectorBody &&
3272         LI->getLoopFor(VectorHeader) == LI->getLoopFor(InsertBB))
3273       return VectorHeader->getTerminator();
3274     return &*B.GetInsertPoint();
3275   };
3276 
3277   switch (ID.getKind()) {
3278   case InductionDescriptor::IK_IntInduction: {
3279     assert(!isa<VectorType>(Index->getType()) &&
3280            "Vector indices not supported for integer inductions yet");
3281     assert(Index->getType() == StartValue->getType() &&
3282            "Index type does not match StartValue type");
3283     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
3284       return B.CreateSub(StartValue, Index);
3285     auto *Offset = CreateMul(
3286         Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
3287     return CreateAdd(StartValue, Offset);
3288   }
3289   case InductionDescriptor::IK_PtrInduction: {
3290     assert(isa<SCEVConstant>(Step) &&
3291            "Expected constant step for pointer induction");
3292     return B.CreateGEP(
3293         ID.getElementType(), StartValue,
3294         CreateMul(Index,
3295                   Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3296                                     GetInsertPoint())));
3297   }
3298   case InductionDescriptor::IK_FpInduction: {
3299     assert(!isa<VectorType>(Index->getType()) &&
3300            "Vector indices not supported for FP inductions yet");
3301     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
3302     auto InductionBinOp = ID.getInductionBinOp();
3303     assert(InductionBinOp &&
3304            (InductionBinOp->getOpcode() == Instruction::FAdd ||
3305             InductionBinOp->getOpcode() == Instruction::FSub) &&
3306            "Original bin op should be defined for FP induction");
3307 
3308     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
3309     Value *MulExp = B.CreateFMul(StepValue, Index);
3310     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
3311                          "induction");
3312   }
3313   case InductionDescriptor::IK_NoInduction:
3314     return nullptr;
3315   }
3316   llvm_unreachable("invalid enum");
3317 }
3318 
3319 Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3320   LoopScalarBody = OrigLoop->getHeader();
3321   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3322   assert(LoopVectorPreHeader && "Invalid loop structure");
3323   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3324   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3325          "multiple exit loop without required epilogue?");
3326 
3327   LoopMiddleBlock =
3328       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3329                  LI, nullptr, Twine(Prefix) + "middle.block");
3330   LoopScalarPreHeader =
3331       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3332                  nullptr, Twine(Prefix) + "scalar.ph");
3333 
3334   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3335 
3336   // Set up the middle block terminator.  Two cases:
3337   // 1) If we know that we must execute the scalar epilogue, emit an
3338   //    unconditional branch.
3339   // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case).  In this case, set up a conditional
3341   //    branch from the middle block to the loop scalar preheader, and the
3342   //    exit block.  completeLoopSkeleton will update the condition to use an
3343   //    iteration check, if required to decide whether to execute the remainder.
3344   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3345     BranchInst::Create(LoopScalarPreHeader) :
3346     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3347                        Builder.getTrue());
3348   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3349   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3350 
  // We intentionally don't let SplitBlock update LoopInfo since
  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
  // LoopVectorBody is explicitly added to the correct place a few lines later.
3354   LoopVectorBody =
3355       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3356                  nullptr, nullptr, Twine(Prefix) + "vector.body");
3357 
3358   // Update dominator for loop exit.
3359   if (!Cost->requiresScalarEpilogue(VF))
3360     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3362     // dominator of the exit blocks.
3363     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3364 
3365   // Create and register the new vector loop.
3366   Loop *Lp = LI->AllocateLoop();
3367   Loop *ParentLoop = OrigLoop->getParentLoop();
3368 
3369   // Insert the new loop into the loop nest and register the new basic blocks
3370   // before calling any utilities such as SCEV that require valid LoopInfo.
3371   if (ParentLoop) {
3372     ParentLoop->addChildLoop(Lp);
3373   } else {
3374     LI->addTopLevelLoop(Lp);
3375   }
3376   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
3377   return Lp;
3378 }
3379 
3380 void InnerLoopVectorizer::createInductionResumeValues(
3381     Loop *L, std::pair<BasicBlock *, Value *> AdditionalBypass) {
3382   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3383           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3384          "Inconsistent information about additional bypass.");
3385 
3386   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3387   assert(VectorTripCount && L && "Expected valid arguments");
3388   // We are going to resume the execution of the scalar loop.
3389   // Go over all of the induction variables that we found and fix the
3390   // PHIs that are left in the scalar version of the loop.
3391   // The starting values of PHI nodes depend on the counter of the last
3392   // iteration in the vectorized loop.
3393   // If we come from a bypass edge then we need to start from the original
3394   // start value.
3395   Instruction *OldInduction = Legal->getPrimaryInduction();
3396   for (auto &InductionEntry : Legal->getInductionVars()) {
3397     PHINode *OrigPhi = InductionEntry.first;
3398     InductionDescriptor II = InductionEntry.second;
3399 
    // Create phi nodes to merge from the backedge-taken check block.
3401     PHINode *BCResumeVal =
3402         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3403                         LoopScalarPreHeader->getTerminator());
3404     // Copy original phi DL over to the new one.
3405     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3406     Value *&EndValue = IVEndValues[OrigPhi];
3407     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3408     if (OrigPhi == OldInduction) {
3409       // We know what the end value is.
3410       EndValue = VectorTripCount;
3411     } else {
3412       IRBuilder<> B(L->getLoopPreheader()->getTerminator());
3413 
3414       // Fast-math-flags propagate from the original induction instruction.
3415       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3416         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3417 
3418       Type *StepType = II.getStep()->getType();
3419       Instruction::CastOps CastOp =
3420           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3421       Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
3422       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3423       EndValue =
3424           emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
3425       EndValue->setName("ind.end");
3426 
3427       // Compute the end value for the additional bypass (if applicable).
3428       if (AdditionalBypass.first) {
3429         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3430         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3431                                          StepType, true);
3432         CRD =
3433             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.crd");
3434         EndValueFromAdditionalBypass =
3435             emitTransformedIndex(B, CRD, PSE.getSE(), DL, II, LoopVectorBody);
3436         EndValueFromAdditionalBypass->setName("ind.end");
3437       }
3438     }
3439     // The new PHI merges the original incoming value, in case of a bypass,
3440     // or the value at the end of the vectorized loop.
3441     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3442 
3443     // Fix the scalar body counter (PHI node).
3444     // The old induction's phi node in the scalar body needs the truncated
3445     // value.
3446     for (BasicBlock *BB : LoopBypassBlocks)
3447       BCResumeVal->addIncoming(II.getStartValue(), BB);
3448 
3449     if (AdditionalBypass.first)
3450       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3451                                             EndValueFromAdditionalBypass);
3452 
3453     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3454   }
3455 }
3456 
3457 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3458                                                       MDNode *OrigLoopID) {
3459   assert(L && "Expected valid loop.");
3460 
3461   // The trip counts should be cached by now.
3462   Value *Count = getOrCreateTripCount(L);
3463   Value *VectorTripCount = getOrCreateVectorTripCount(L);
3464 
3465   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3466 
3467   // Add a check in the middle block to see if we have completed
3468   // all of the iterations in the first vector loop.  Three cases:
3469   // 1) If we require a scalar epilogue, there is no conditional branch as
3470   //    we unconditionally branch to the scalar preheader.  Do nothing.
3471   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3472   //    Thus if tail is to be folded, we know we don't need to run the
3473   //    remainder and we can use the previous value for the condition (true).
3474   // 3) Otherwise, construct a runtime check.
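  // Illustrative example for case 3 (assuming N = 10 and a vector trip count
  // of 8): the check %cmp.n = icmp eq i64 %N, %n.vec compares 10 with 8 and
  // is false, so the middle block branch falls through to the scalar
  // preheader and the remaining 2 iterations run in the scalar loop.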
3475   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3476     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3477                                         Count, VectorTripCount, "cmp.n",
3478                                         LoopMiddleBlock->getTerminator());
3479 
3480     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3481     // of the corresponding compare because they may have ended up with
3482     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g., if the compare has a line number inside the loop.
3484     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3485     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3486   }
3487 
3488   // Get ready to start creating new instructions into the vectorized body.
3489   assert(LoopVectorPreHeader == L->getLoopPreheader() &&
3490          "Inconsistent vector loop preheader");
3491   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3492 
3493 #ifdef EXPENSIVE_CHECKS
3494   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3495   LI->verify(*DT);
3496 #endif
3497 
3498   return LoopVectorPreHeader;
3499 }
3500 
3501 std::pair<BasicBlock *, Value *>
3502 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3503   /*
3504    In this function we generate a new loop. The new loop will contain
3505    the vectorized instructions while the old loop will continue to run the
3506    scalar remainder.
3507 
3508        [ ] <-- loop iteration number check.
3509     /   |
3510    /    v
3511   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3512   |  /  |
3513   | /   v
3514   ||   [ ]     <-- vector pre header.
3515   |/    |
3516   |     v
3517   |    [  ] \
3518   |    [  ]_|   <-- vector loop.
3519   |     |
3520   |     v
3521   \   -[ ]   <--- middle-block.
3522    \/   |
3523    /\   v
3524    | ->[ ]     <--- new preheader.
3525    |    |
3526  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3527    |   [ ] \
3528    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3529     \   |
3530      \  v
3531       >[ ]     <-- exit block(s).
3532    ...
3533    */
3534 
3535   // Get the metadata of the original loop before it gets modified.
3536   MDNode *OrigLoopID = OrigLoop->getLoopID();
3537 
3538   // Workaround!  Compute the trip count of the original loop and cache it
3539   // before we start modifying the CFG.  This code has a systemic problem
3540   // wherein it tries to run analysis over partially constructed IR; this is
3541   // wrong, and not simply for SCEV.  The trip count of the original loop
3542   // simply happens to be prone to hitting this in practice.  In theory, we
3543   // can hit the same issue for any SCEV, or ValueTracking query done during
3544   // mutation.  See PR49900.
3545   getOrCreateTripCount(OrigLoop);
3546 
3547   // Create an empty vector loop, and prepare basic blocks for the runtime
3548   // checks.
3549   Loop *Lp = createVectorLoopSkeleton("");
3550 
3551   // Now, compare the new count to zero. If it is zero skip the vector loop and
3552   // jump to the scalar loop. This check also covers the case where the
3553   // backedge-taken count is uint##_max: adding one to it will overflow leading
3554   // to an incorrect trip count of zero. In this (rare) case we will also jump
3555   // to the scalar loop.
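  // In shorthand IR (names illustrative), the emitted guard looks roughly like:
  //   %min.iters.check = icmp ult i64 %trip.count, <VF * UF>
  //   br i1 %min.iters.check, label %scalar.ph, label %vector.ph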
3556   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3557 
3558   // Generate the code to check any assumptions that we've made for SCEV
3559   // expressions.
3560   emitSCEVChecks(Lp, LoopScalarPreHeader);
3561 
3562   // Generate the code that checks in runtime if arrays overlap. We put the
3563   // checks into a separate block to make the more common case of few elements
3564   // faster.
3565   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3566 
3567   createHeaderBranch(Lp);
3568 
3569   // Emit phis for the new starting index of the scalar loop.
3570   createInductionResumeValues(Lp);
3571 
3572   return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
3573 }
3574 
3575 // Fix up external users of the induction variable. At this point, we are
3576 // in LCSSA form, with all external PHIs that use the IV having one input value,
3577 // coming from the remainder loop. We need those PHIs to also have a correct
3578 // value for the IV when arriving directly from the middle block.
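// For example (illustrative C-like source):
//   for (i = 0; i < n; ++i) { ... }
//   use(i);
// The LCSSA phi feeding 'use(i)' normally takes its value from the scalar
// remainder loop; it must also be given the correct end value for the case
// where control reaches it directly from the middle block.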
3579 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3580                                        const InductionDescriptor &II,
3581                                        Value *CountRoundDown, Value *EndValue,
3582                                        BasicBlock *MiddleBlock) {
3583   // There are two kinds of external IV usages - those that use the value
3584   // computed in the last iteration (the PHI) and those that use the penultimate
3585   // value (the value that feeds into the phi from the loop latch).
3586   // We allow both, but they obviously have different values.
3587 
3588   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3589 
3590   DenseMap<Value *, Value *> MissingVals;
3591 
3592   // An external user of the last iteration's value should see the value that
3593   // the remainder loop uses to initialize its own IV.
3594   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3595   for (User *U : PostInc->users()) {
3596     Instruction *UI = cast<Instruction>(U);
3597     if (!OrigLoop->contains(UI)) {
3598       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3599       MissingVals[UI] = EndValue;
3600     }
3601   }
3602 
3603   // An external user of the penultimate value needs to see EndValue - Step.
3604   // The simplest way to get this is to recompute it from the constituent SCEVs,
3605   // that is Start + (Step * (CRD - 1)).
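  // For example (illustrative numbers): with Start = 0, Step = 2 and CRD = 8,
  // the penultimate value is 0 + 2 * (8 - 1) = 14.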
3606   for (User *U : OrigPhi->users()) {
3607     auto *UI = cast<Instruction>(U);
3608     if (!OrigLoop->contains(UI)) {
3609       const DataLayout &DL =
3610           OrigLoop->getHeader()->getModule()->getDataLayout();
3611       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3612 
3613       IRBuilder<> B(MiddleBlock->getTerminator());
3614 
3615       // Fast-math-flags propagate from the original induction instruction.
3616       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3617         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3618 
3619       Value *CountMinusOne = B.CreateSub(
3620           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3621       Value *CMO =
3622           !II.getStep()->getType()->isIntegerTy()
3623               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3624                              II.getStep()->getType())
3625               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3626       CMO->setName("cast.cmo");
3627       Value *Escape =
3628           emitTransformedIndex(B, CMO, PSE.getSE(), DL, II, LoopVectorBody);
3629       Escape->setName("ind.escape");
3630       MissingVals[UI] = Escape;
3631     }
3632   }
3633 
3634   for (auto &I : MissingVals) {
3635     PHINode *PHI = cast<PHINode>(I.first);
3636     // One corner case we have to handle is two IVs "chasing" each-other,
3637     // that is %IV2 = phi [...], [ %IV1, %latch ]
3638     // In this case, if IV1 has an external use, we need to avoid adding both
3639     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3640     // don't already have an incoming value for the middle block.
3641     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3642       PHI->addIncoming(I.second, MiddleBlock);
3643   }
3644 }
3645 
3646 namespace {
3647 
3648 struct CSEDenseMapInfo {
3649   static bool canHandle(const Instruction *I) {
3650     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3651            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3652   }
3653 
3654   static inline Instruction *getEmptyKey() {
3655     return DenseMapInfo<Instruction *>::getEmptyKey();
3656   }
3657 
3658   static inline Instruction *getTombstoneKey() {
3659     return DenseMapInfo<Instruction *>::getTombstoneKey();
3660   }
3661 
3662   static unsigned getHashValue(const Instruction *I) {
3663     assert(canHandle(I) && "Unknown instruction!");
3664     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3665                                                            I->value_op_end()));
3666   }
3667 
3668   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3669     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3670         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3671       return LHS == RHS;
3672     return LHS->isIdenticalTo(RHS);
3673   }
3674 };
3675 
3676 } // end anonymous namespace
3677 
3678 /// Perform CSE of induction variable instructions.
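/// For example (shorthand IR, illustrative), two identical address
/// computations produced while unrolling,
///   %g1 = getelementptr inbounds i32, i32* %base, i64 %idx
///   %g2 = getelementptr inbounds i32, i32* %base, i64 %idx
/// collapse into one, with uses of %g2 rewritten to use %g1.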
3679 static void cse(BasicBlock *BB) {
3680   // Perform simple cse.
3681   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3682   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3683     if (!CSEDenseMapInfo::canHandle(&In))
3684       continue;
3685 
3686     // Check if we can replace this instruction with any of the
3687     // visited instructions.
3688     if (Instruction *V = CSEMap.lookup(&In)) {
3689       In.replaceAllUsesWith(V);
3690       In.eraseFromParent();
3691       continue;
3692     }
3693 
3694     CSEMap[&In] = &In;
3695   }
3696 }
3697 
3698 InstructionCost
3699 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3700                                               bool &NeedToScalarize) const {
3701   Function *F = CI->getCalledFunction();
3702   Type *ScalarRetTy = CI->getType();
3703   SmallVector<Type *, 4> Tys, ScalarTys;
3704   for (auto &ArgOp : CI->args())
3705     ScalarTys.push_back(ArgOp->getType());
3706 
3707   // Estimate cost of scalarized vector call. The source operands are assumed
3708   // to be vectors, so we need to extract individual elements from there,
3709   // execute VF scalar calls, and then gather the result into the vector return
3710   // value.
3711   InstructionCost ScalarCallCost =
3712       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3713   if (VF.isScalar())
3714     return ScalarCallCost;
3715 
3716   // Compute corresponding vector type for return value and arguments.
3717   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3718   for (Type *ScalarTy : ScalarTys)
3719     Tys.push_back(ToVectorTy(ScalarTy, VF));
3720 
3721   // Compute costs of unpacking argument values for the scalar calls and
3722   // packing the return values to a vector.
3723   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3724 
3725   InstructionCost Cost =
3726       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
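  // For example (hypothetical numbers): with VF = 4, a scalar call cost of 10
  // and a scalarization overhead of 6, the scalarized estimate is
  // 4 * 10 + 6 = 46.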
3727 
3728   // If we can't emit a vector call for this function, then the currently found
3729   // cost is the cost we need to return.
3730   NeedToScalarize = true;
3731   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3732   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3733 
3734   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3735     return Cost;
3736 
3737   // If the corresponding vector cost is cheaper, return its cost.
3738   InstructionCost VectorCallCost =
3739       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3740   if (VectorCallCost < Cost) {
3741     NeedToScalarize = false;
3742     Cost = VectorCallCost;
3743   }
3744   return Cost;
3745 }
3746 
3747 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3748   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3749     return Elt;
3750   return VectorType::get(Elt, VF);
3751 }
3752 
3753 InstructionCost
3754 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3755                                                    ElementCount VF) const {
3756   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3757   assert(ID && "Expected intrinsic call!");
3758   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3759   FastMathFlags FMF;
3760   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3761     FMF = FPMO->getFastMathFlags();
3762 
3763   SmallVector<const Value *> Arguments(CI->args());
3764   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3765   SmallVector<Type *> ParamTys;
3766   std::transform(FTy->param_begin(), FTy->param_end(),
3767                  std::back_inserter(ParamTys),
3768                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3769 
3770   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3771                                     dyn_cast<IntrinsicInst>(CI));
3772   return TTI.getIntrinsicInstrCost(CostAttrs,
3773                                    TargetTransformInfo::TCK_RecipThroughput);
3774 }
3775 
3776 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3777   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3778   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3779   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3780 }
3781 
3782 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3783   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3784   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3785   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3786 }
3787 
3788 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3789   // For every instruction `I` in MinBWs, truncate the operands, create a
3790   // truncated version of `I` and reextend its result. InstCombine runs
3791   // later and will remove any ext/trunc pairs.
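  // For example (shorthand IR, illustrative): if MinBWs records that an i32
  // add only needs 8 bits, then
  //   %a = add <4 x i32> %x, %y
  // is replaced by
  //   %x.tr = trunc <4 x i32> %x to <4 x i8>
  //   %y.tr = trunc <4 x i32> %y to <4 x i8>
  //   %a.tr = add <4 x i8> %x.tr, %y.tr
  //   %a.ext = zext <4 x i8> %a.tr to <4 x i32>
  // with all uses of %a rewritten to use %a.ext.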
3792   SmallPtrSet<Value *, 4> Erased;
3793   for (const auto &KV : Cost->getMinimalBitwidths()) {
3794     // If the value wasn't vectorized, we must maintain the original scalar
3795     // type. The absence of the value from State indicates that it
3796     // wasn't vectorized.
3797     // FIXME: Should not rely on getVPValue at this point.
3798     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3799     if (!State.hasAnyVectorValue(Def))
3800       continue;
3801     for (unsigned Part = 0; Part < UF; ++Part) {
3802       Value *I = State.get(Def, Part);
3803       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3804         continue;
3805       Type *OriginalTy = I->getType();
3806       Type *ScalarTruncatedTy =
3807           IntegerType::get(OriginalTy->getContext(), KV.second);
3808       auto *TruncatedTy = VectorType::get(
3809           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3810       if (TruncatedTy == OriginalTy)
3811         continue;
3812 
3813       IRBuilder<> B(cast<Instruction>(I));
3814       auto ShrinkOperand = [&](Value *V) -> Value * {
3815         if (auto *ZI = dyn_cast<ZExtInst>(V))
3816           if (ZI->getSrcTy() == TruncatedTy)
3817             return ZI->getOperand(0);
3818         return B.CreateZExtOrTrunc(V, TruncatedTy);
3819       };
3820 
3821       // The actual instruction modification depends on the instruction type,
3822       // unfortunately.
3823       Value *NewI = nullptr;
3824       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3825         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3826                              ShrinkOperand(BO->getOperand(1)));
3827 
3828         // Any wrapping introduced by shrinking this operation shouldn't be
3829         // considered undefined behavior. So, we can't unconditionally copy
3830         // arithmetic wrapping flags to NewI.
3831         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3832       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3833         NewI =
3834             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3835                          ShrinkOperand(CI->getOperand(1)));
3836       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3837         NewI = B.CreateSelect(SI->getCondition(),
3838                               ShrinkOperand(SI->getTrueValue()),
3839                               ShrinkOperand(SI->getFalseValue()));
3840       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3841         switch (CI->getOpcode()) {
3842         default:
3843           llvm_unreachable("Unhandled cast!");
3844         case Instruction::Trunc:
3845           NewI = ShrinkOperand(CI->getOperand(0));
3846           break;
3847         case Instruction::SExt:
3848           NewI = B.CreateSExtOrTrunc(
3849               CI->getOperand(0),
3850               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3851           break;
3852         case Instruction::ZExt:
3853           NewI = B.CreateZExtOrTrunc(
3854               CI->getOperand(0),
3855               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3856           break;
3857         }
3858       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3859         auto Elements0 =
3860             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3861         auto *O0 = B.CreateZExtOrTrunc(
3862             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3863         auto Elements1 =
3864             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3865         auto *O1 = B.CreateZExtOrTrunc(
3866             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3867 
3868         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3869       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3870         // Don't do anything with the operands, just extend the result.
3871         continue;
3872       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3873         auto Elements =
3874             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3875         auto *O0 = B.CreateZExtOrTrunc(
3876             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3877         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3878         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3879       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3880         auto Elements =
3881             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3882         auto *O0 = B.CreateZExtOrTrunc(
3883             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3884         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3885       } else {
3886         // If we don't know what to do, be conservative and don't do anything.
3887         continue;
3888       }
3889 
3890       // Lastly, extend the result.
3891       NewI->takeName(cast<Instruction>(I));
3892       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3893       I->replaceAllUsesWith(Res);
3894       cast<Instruction>(I)->eraseFromParent();
3895       Erased.insert(I);
3896       State.reset(Def, Res, Part);
3897     }
3898   }
3899 
3900   // We'll have created a bunch of ZExts that are now dead. Clean them up.
3901   for (const auto &KV : Cost->getMinimalBitwidths()) {
3902     // If the value wasn't vectorized, we must maintain the original scalar
3903     // type. The absence of the value from State indicates that it
3904     // wasn't vectorized.
3905     // FIXME: Should not rely on getVPValue at this point.
3906     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3907     if (!State.hasAnyVectorValue(Def))
3908       continue;
3909     for (unsigned Part = 0; Part < UF; ++Part) {
3910       Value *I = State.get(Def, Part);
3911       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3912       if (Inst && Inst->use_empty()) {
3913         Value *NewI = Inst->getOperand(0);
3914         Inst->eraseFromParent();
3915         State.reset(Def, NewI, Part);
3916       }
3917     }
3918   }
3919 }
3920 
3921 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
3922   // Insert truncates and extends for any truncated instructions as hints to
3923   // InstCombine.
3924   if (VF.isVector())
3925     truncateToMinimalBitwidths(State);
3926 
3927   // Fix widened non-induction PHIs by setting up the PHI operands.
3928   if (OrigPHIsToFix.size()) {
3929     assert(EnableVPlanNativePath &&
3930            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3931     fixNonInductionPHIs(State);
3932   }
3933 
3934   // At this point every instruction in the original loop is widened to a
3935   // vector form. Now we need to fix the recurrences in the loop. These PHI
3936   // nodes are currently empty because we did not want to introduce cycles.
3937   // This is the second stage of vectorizing recurrences.
3938   fixCrossIterationPHIs(State);
3939 
3940   // Forget the original basic block.
3941   PSE.getSE()->forgetLoop(OrigLoop);
3942 
3943   // If we inserted an edge from the middle block to the unique exit block,
3944   // update uses outside the loop (phis) to account for the newly inserted
3945   // edge.
3946   if (!Cost->requiresScalarEpilogue(VF)) {
3947     // Fix-up external users of the induction variables.
3948     for (auto &Entry : Legal->getInductionVars())
3949       fixupIVUsers(Entry.first, Entry.second,
3950                    getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3951                    IVEndValues[Entry.first], LoopMiddleBlock);
3952 
3953     fixLCSSAPHIs(State);
3954   }
3955 
3956   for (Instruction *PI : PredicatedInstructions)
3957     sinkScalarOperands(&*PI);
3958 
3959   // Remove redundant induction instructions.
3960   cse(LoopVectorBody);
3961 
3962   // Set/update profile weights for the vector and remainder loops as original
3963   // loop iterations are now distributed among them. Note that the original
3964   // loop (LoopScalarBody) becomes the remainder loop after vectorization.
3965   //
3966   // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
3967   // end up getting a slightly roughened result but that should be OK since
3968   // profile is not inherently precise anyway. Note also possible bypass of
3969   // vector code caused by legality checks is ignored, assigning all the weight
3970   // to the vector loop, optimistically.
3971   //
3972   // For scalable vectorization we can't know at compile time how many
3973   // iterations of the loop are handled in one vector iteration, so instead
3974   // assume a pessimistic vscale of '1'.
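  // For example (illustrative): if the original profile implies roughly 1000
  // iterations and VF * UF = 8, the vector loop is credited with roughly
  // 1000 / 8 = 125 iterations' worth of weight and the scalar remainder loop
  // with the rest.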
3975   setProfileInfoAfterUnrolling(
3976       LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
3977       LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
3978 }
3979 
3980 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3981   // In order to support recurrences we need to be able to vectorize Phi nodes.
3982   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3983   // stage #2: We now need to fix the recurrences by adding incoming edges to
3984   // the currently empty PHI nodes. At this point every instruction in the
3985   // original loop is widened to a vector form so we can use them to construct
3986   // the incoming edges.
3987   VPBasicBlock *Header = State.Plan->getEntry()->getEntryBasicBlock();
3988   for (VPRecipeBase &R : Header->phis()) {
3989     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3990       fixReduction(ReductionPhi, State);
3991     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3992       fixFirstOrderRecurrence(FOR, State);
3993   }
3994 }
3995 
3996 void InnerLoopVectorizer::fixFirstOrderRecurrence(
3997     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3998   // This is the second phase of vectorizing first-order recurrences. An
3999   // overview of the transformation is described below. Suppose we have the
4000   // following loop.
4001   //
4002   //   for (int i = 0; i < n; ++i)
4003   //     b[i] = a[i] - a[i - 1];
4004   //
4005   // There is a first-order recurrence on "a". For this loop, the shorthand
4006   // scalar IR looks like:
4007   //
4008   //   scalar.ph:
4009   //     s_init = a[-1]
4010   //     br scalar.body
4011   //
4012   //   scalar.body:
4013   //     i = phi [0, scalar.ph], [i+1, scalar.body]
4014   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
4015   //     s2 = a[i]
4016   //     b[i] = s2 - s1
4017   //     br cond, scalar.body, ...
4018   //
4019   // In this example, s1 is a recurrence because its value depends on the
4020   // previous iteration. In the first phase of vectorization, we created a
4021   // vector phi v1 for s1. We now complete the vectorization and produce the
4022   // shorthand vector IR shown below (for VF = 4, UF = 1).
4023   //
4024   //   vector.ph:
4025   //     v_init = vector(..., ..., ..., a[-1])
4026   //     br vector.body
4027   //
4028   //   vector.body
4029   //     i = phi [0, vector.ph], [i+4, vector.body]
4030   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
4031   //     v2 = a[i, i+1, i+2, i+3];
4032   //     v3 = vector(v1(3), v2(0, 1, 2))
4033   //     b[i, i+1, i+2, i+3] = v2 - v3
4034   //     br cond, vector.body, middle.block
4035   //
4036   //   middle.block:
4037   //     x = v2(3)
4038   //     br scalar.ph
4039   //
4040   //   scalar.ph:
4041   //     s_init = phi [x, middle.block], [a[-1], otherwise]
4042   //     br scalar.body
4043   //
4044   // After execution completes the vector loop, we extract the next value of
4045   // the recurrence (x) to use as the initial value in the scalar loop.
4046 
4047   // Extract the last vector element in the middle block. This will be the
4048   // initial value for the recurrence when jumping to the scalar loop.
4049   VPValue *PreviousDef = PhiR->getBackedgeValue();
4050   Value *Incoming = State.get(PreviousDef, UF - 1);
4051   auto *ExtractForScalar = Incoming;
4052   auto *IdxTy = Builder.getInt32Ty();
4053   if (VF.isVector()) {
4054     auto *One = ConstantInt::get(IdxTy, 1);
4055     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4056     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4057     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
4058     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
4059                                                     "vector.recur.extract");
4060   }
4061   // Extract the second last element in the middle block if the
4062   // Phi is used outside the loop. We need to extract the phi itself
4063   // and not the last element (the phi update in the current iteration). This
4064   // will be the value when jumping to the exit block from the LoopMiddleBlock,
4065   // when the scalar loop is not run at all.
4066   Value *ExtractForPhiUsedOutsideLoop = nullptr;
4067   if (VF.isVector()) {
4068     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
4069     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
4070     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
4071         Incoming, Idx, "vector.recur.extract.for.phi");
4072   } else if (UF > 1)
4073     // When the loop is unrolled without vectorizing, initialize
4074     // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
4075     // value of `Incoming`. This is analogous to the vectorized case above:
4076     // extracting the second last element when VF > 1.
4077     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
4078 
4079   // Fix the initial value of the original recurrence in the scalar loop.
4080   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
4081   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
4082   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
4083   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
4084   for (auto *BB : predecessors(LoopScalarPreHeader)) {
4085     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
4086     Start->addIncoming(Incoming, BB);
4087   }
4088 
4089   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
4090   Phi->setName("scalar.recur");
4091 
4092   // Finally, fix users of the recurrence outside the loop. The users will need
4093   // either the last value of the scalar recurrence or the last value of the
4094   // vector recurrence we extracted in the middle block. Since the loop is in
4095   // LCSSA form, we just need to find all the phi nodes for the original scalar
4096   // recurrence in the exit block, and then add an edge for the middle block.
4097   // Note that LCSSA does not imply single entry when the original scalar loop
4098   // had multiple exiting edges (as we always run the last iteration in the
4099   // scalar epilogue); in that case, there is no edge from middle to exit and
4100   // thus no phis which need to be updated.
4101   if (!Cost->requiresScalarEpilogue(VF))
4102     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4103       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi))
4104         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4105 }
4106 
4107 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4108                                        VPTransformState &State) {
4109   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
4110   // Get its reduction variable descriptor.
4111   assert(Legal->isReductionVariable(OrigPhi) &&
4112          "Unable to find the reduction variable");
4113   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4114 
4115   RecurKind RK = RdxDesc.getRecurrenceKind();
4116   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
4117   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
4118   setDebugLocFromInst(ReductionStartValue);
4119 
4120   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
4121   // This is the vector-clone of the value that leaves the loop.
4122   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
4123 
4124   // Wrap flags are in general invalid after vectorization, clear them.
4125   clearReductionWrapFlags(RdxDesc, State);
4126 
4127   // Before each round, move the insertion point right between
4128   // the PHIs and the values we are going to write.
4129   // This allows us to write both PHINodes and the extractelement
4130   // instructions.
4131   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4132 
4133   setDebugLocFromInst(LoopExitInst);
4134 
4135   Type *PhiTy = OrigPhi->getType();
4136   // If tail is folded by masking, the vector value to leave the loop should be
4137   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
4138   // instead of the former. For an inloop reduction the reduction will already
4139   // be predicated, and does not need to be handled here.
4140   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
4141     for (unsigned Part = 0; Part < UF; ++Part) {
4142       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
4143       Value *Sel = nullptr;
4144       for (User *U : VecLoopExitInst->users()) {
4145         if (isa<SelectInst>(U)) {
4146           assert(!Sel && "Reduction exit feeding two selects");
4147           Sel = U;
4148         } else
4149           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
4150       }
4151       assert(Sel && "Reduction exit feeds no select");
4152       State.reset(LoopExitInstDef, Sel, Part);
4153 
4154       // If the target can create a predicated operator for the reduction at no
4155       // extra cost in the loop (for example a predicated vadd), it can be
4156       // cheaper for the select to remain in the loop than be sunk out of it,
4157       // and so use the select value for the phi instead of the old
4158       // LoopExitValue.
4159       if (PreferPredicatedReductionSelect ||
4160           TTI->preferPredicatedReductionSelect(
4161               RdxDesc.getOpcode(), PhiTy,
4162               TargetTransformInfo::ReductionFlags())) {
4163         auto *VecRdxPhi =
4164             cast<PHINode>(State.get(PhiR, Part));
4165         VecRdxPhi->setIncomingValueForBlock(
4166             LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
4167       }
4168     }
4169   }
4170 
4171   // If the vector reduction can be performed in a smaller type, we truncate
4172   // then extend the loop exit value to enable InstCombine to evaluate the
4173   // entire expression in the smaller type.
4174   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
4175     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
4176     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
4177     Builder.SetInsertPoint(
4178         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
4179     VectorParts RdxParts(UF);
4180     for (unsigned Part = 0; Part < UF; ++Part) {
4181       RdxParts[Part] = State.get(LoopExitInstDef, Part);
4182       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4183       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
4184                                         : Builder.CreateZExt(Trunc, VecTy);
4185       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
4186         if (U != Trunc) {
4187           U->replaceUsesOfWith(RdxParts[Part], Extnd);
4188           RdxParts[Part] = Extnd;
4189         }
4190     }
4191     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
4192     for (unsigned Part = 0; Part < UF; ++Part) {
4193       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
4194       State.reset(LoopExitInstDef, RdxParts[Part], Part);
4195     }
4196   }
4197 
4198   // Reduce all of the unrolled parts into a single vector.
4199   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
4200   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
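  // The per-part values are combined pairwise below. For example (shorthand
  // IR, illustrative, UF = 2, integer add reduction):
  //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0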
4201 
4202   // The middle block terminator has already been assigned a DebugLoc here (the
4203   // OrigLoop's single latch terminator). We want the whole middle block to
4204   // appear to execute on this line because: (a) it is all compiler generated,
4205   // (b) these instructions are always executed after evaluating the latch
4206   // conditional branch, and (c) other passes may add new predecessors which
4207   // terminate on this line. This is the easiest way to ensure we don't
4208   // accidentally cause an extra step back into the loop while debugging.
4209   setDebugLocFromInst(LoopMiddleBlock->getTerminator());
4210   if (PhiR->isOrdered())
4211     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
4212   else {
4213     // Floating-point operations should have some FMF to enable the reduction.
4214     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
4215     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
4216     for (unsigned Part = 1; Part < UF; ++Part) {
4217       Value *RdxPart = State.get(LoopExitInstDef, Part);
4218       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
4219         ReducedPartRdx = Builder.CreateBinOp(
4220             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
4221       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
4222         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
4223                                            ReducedPartRdx, RdxPart);
4224       else
4225         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
4226     }
4227   }
4228 
4229   // Create the reduction after the loop. Note that inloop reductions create the
4230   // target reduction in the loop using a Reduction recipe.
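  // For example (shorthand IR, illustrative, add reduction with VF = 4):
  //   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)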
4231   if (VF.isVector() && !PhiR->isInLoop()) {
4232     ReducedPartRdx =
4233         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
4234     // If the reduction can be performed in a smaller type, we need to extend
4235     // the reduction to the wider type before we branch to the original loop.
4236     if (PhiTy != RdxDesc.getRecurrenceType())
4237       ReducedPartRdx = RdxDesc.isSigned()
4238                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
4239                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
4240   }
4241 
4242   PHINode *ResumePhi =
4243       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
4244 
4245   // Create a phi node that merges control-flow from the backedge-taken check
4246   // block and the middle block.
4247   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
4248                                         LoopScalarPreHeader->getTerminator());
4249 
4250   // If we are fixing reductions in the epilogue loop then we should already
4251   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
4252   // we carry over the incoming values correctly.
4253   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
4254     if (Incoming == LoopMiddleBlock)
4255       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
4256     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
4257       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
4258                               Incoming);
4259     else
4260       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4261   }
4262 
4263   // Set the resume value for this reduction
4264   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4265 
4266   // Now, we need to fix the users of the reduction variable
4267   // inside and outside of the scalar remainder loop.
4268 
4269   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4270   // in the exit blocks.  See comment on analogous loop in
4271   // fixFirstOrderRecurrence for a more complete explanation of the logic.
4272   if (!Cost->requiresScalarEpilogue(VF))
4273     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4274       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst))
4275         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4276 
4277   // Fix the scalar loop reduction variable with the incoming reduction sum
4278   // from the vector body and from the backedge value.
4279   int IncomingEdgeBlockIdx =
4280       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4281   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4282   // Pick the other block.
4283   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4284   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4285   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4286 }
4287 
4288 void InnerLoopVectorizer::clearReductionWrapFlags(const RecurrenceDescriptor &RdxDesc,
4289                                                   VPTransformState &State) {
4290   RecurKind RK = RdxDesc.getRecurrenceKind();
4291   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4292     return;
4293 
4294   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
4295   assert(LoopExitInstr && "null loop exit instruction");
4296   SmallVector<Instruction *, 8> Worklist;
4297   SmallPtrSet<Instruction *, 8> Visited;
4298   Worklist.push_back(LoopExitInstr);
4299   Visited.insert(LoopExitInstr);
4300 
4301   while (!Worklist.empty()) {
4302     Instruction *Cur = Worklist.pop_back_val();
4303     if (isa<OverflowingBinaryOperator>(Cur))
4304       for (unsigned Part = 0; Part < UF; ++Part) {
4305         // FIXME: Should not rely on getVPValue at this point.
4306         Value *V = State.get(State.Plan->getVPValue(Cur, true), Part);
4307         cast<Instruction>(V)->dropPoisonGeneratingFlags();
4308       }
4309 
4310     for (User *U : Cur->users()) {
4311       Instruction *UI = cast<Instruction>(U);
4312       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
4313           Visited.insert(UI).second)
4314         Worklist.push_back(UI);
4315     }
4316   }
4317 }
4318 
4319 void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
4320   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
4321     if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
4322       // Some phis were already hand updated by the reduction and recurrence
4323       // code above, leave them alone.
4324       continue;
4325 
4326     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
4327     // Non-instruction incoming values will have only one value.
4328 
4329     VPLane Lane = VPLane::getFirstLane();
4330     if (isa<Instruction>(IncomingValue) &&
4331         !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
4332                                            VF))
4333       Lane = VPLane::getLastLaneForVF(VF);
4334 
4335     // Can be a loop invariant incoming value or the last scalar value to be
4336     // extracted from the vectorized loop.
4337     // FIXME: Should not rely on getVPValue at this point.
4338     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
4339     Value *lastIncomingValue =
4340         OrigLoop->isLoopInvariant(IncomingValue)
4341             ? IncomingValue
4342             : State.get(State.Plan->getVPValue(IncomingValue, true),
4343                         VPIteration(UF - 1, Lane));
4344     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
4345   }
4346 }
4347 
4348 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4349   // The basic block and loop containing the predicated instruction.
4350   auto *PredBB = PredInst->getParent();
4351   auto *VectorLoop = LI->getLoopFor(PredBB);
4352 
4353   // Initialize a worklist with the operands of the predicated instruction.
4354   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4355 
4356   // Holds instructions that we need to analyze again. An instruction may be
4357   // reanalyzed if we don't yet know if we can sink it or not.
4358   SmallVector<Instruction *, 8> InstsToReanalyze;
4359 
4360   // Returns true if a given use occurs in the predicated block. Phi nodes use
4361   // their operands in their corresponding predecessor blocks.
4362   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4363     auto *I = cast<Instruction>(U.getUser());
4364     BasicBlock *BB = I->getParent();
4365     if (auto *Phi = dyn_cast<PHINode>(I))
4366       BB = Phi->getIncomingBlock(
4367           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4368     return BB == PredBB;
4369   };
4370 
4371   // Iteratively sink the scalarized operands of the predicated instruction
4372   // into the block we created for it. When an instruction is sunk, its
4373   // operands are then added to the worklist. The algorithm ends after one pass
4374   // through the worklist doesn't sink a single instruction.
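  // For example (illustrative): an address computation that feeds only a
  // predicated store can be moved into the predicated block, so that it only
  // executes when the predicate is true.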
4375   bool Changed;
4376   do {
4377     // Add the instructions that need to be reanalyzed to the worklist, and
4378     // reset the changed indicator.
4379     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4380     InstsToReanalyze.clear();
4381     Changed = false;
4382 
4383     while (!Worklist.empty()) {
4384       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4385 
4386       // We can't sink an instruction if it is a phi node, is not in the loop,
4387       // or may have side effects.
4388       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4389           I->mayHaveSideEffects())
4390         continue;
4391 
4392       // If the instruction is already in PredBB, check if we can sink its
4393       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4394       // sinking the scalar instruction I, hence it appears in PredBB; but it
4395       // may have failed to sink I's operands (recursively), which we try
4396       // (again) here.
4397       if (I->getParent() == PredBB) {
4398         Worklist.insert(I->op_begin(), I->op_end());
4399         continue;
4400       }
4401 
4402       // It's legal to sink the instruction if all its uses occur in the
4403       // predicated block. Otherwise, there's nothing to do yet, and we may
4404       // need to reanalyze the instruction.
4405       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4406         InstsToReanalyze.push_back(I);
4407         continue;
4408       }
4409 
4410       // Move the instruction to the beginning of the predicated block, and add
4411       // its operands to the worklist.
4412       I->moveBefore(&*PredBB->getFirstInsertionPt());
4413       Worklist.insert(I->op_begin(), I->op_end());
4414 
4415       // The sinking may have enabled other instructions to be sunk, so we will
4416       // need to iterate.
4417       Changed = true;
4418     }
4419   } while (Changed);
4420 }
4421 
4422 void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
4423   for (PHINode *OrigPhi : OrigPHIsToFix) {
4424     VPWidenPHIRecipe *VPPhi =
4425         cast<VPWidenPHIRecipe>(State.Plan->getVPValue(OrigPhi));
4426     PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4427     // Make sure the builder has a valid insert point.
4428     Builder.SetInsertPoint(NewPhi);
4429     for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4430       VPValue *Inc = VPPhi->getIncomingValue(i);
4431       VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4432       NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4433     }
4434   }
4435 }
4436 
4437 bool InnerLoopVectorizer::useOrderedReductions(
4438     const RecurrenceDescriptor &RdxDesc) {
4439   return Cost->useOrderedReductions(RdxDesc);
4440 }
4441 
4442 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
4443                                               VPWidenPHIRecipe *PhiR,
4444                                               VPTransformState &State) {
4445   PHINode *P = cast<PHINode>(PN);
4446   if (EnableVPlanNativePath) {
4447     // Currently we enter here in the VPlan-native path for non-induction
4448     // PHIs where all control flow is uniform. We simply widen these PHIs.
4449     // Create a vector phi with no operands - the vector phi operands will be
4450     // set at the end of vector code generation.
4451     Type *VecTy = (State.VF.isScalar())
4452                       ? PN->getType()
4453                       : VectorType::get(PN->getType(), State.VF);
4454     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4455     State.set(PhiR, VecPhi, 0);
4456     OrigPHIsToFix.push_back(P);
4457 
4458     return;
4459   }
4460 
4461   assert(PN->getParent() == OrigLoop->getHeader() &&
4462          "Non-header phis should have been handled elsewhere");
4463 
4464   // In order to support recurrences we need to be able to vectorize Phi nodes.
4465   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4466   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4467   // this value when we vectorize all of the instructions that use the PHI.
4468 
4469   assert(!Legal->isReductionVariable(P) &&
4470          "reductions should be handled elsewhere");
4471 
4472   setDebugLocFromInst(P);
4473 
4474   // This PHINode must be an induction variable.
4475   // Make sure that we know about it.
4476   assert(Legal->getInductionVars().count(P) && "Not an induction variable");
4477 
4478   InductionDescriptor II = Legal->getInductionVars().lookup(P);
4479   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4480 
4481   auto *IVR = PhiR->getParent()->getPlan()->getCanonicalIV();
4482   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
4483 
4484   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4485   // which can be found from the original scalar operations.
4486   switch (II.getKind()) {
4487   case InductionDescriptor::IK_NoInduction:
4488     llvm_unreachable("Unknown induction");
4489   case InductionDescriptor::IK_IntInduction:
4490   case InductionDescriptor::IK_FpInduction:
4491     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4492   case InductionDescriptor::IK_PtrInduction: {
4493     // Handle the pointer induction variable case.
4494     assert(P->getType()->isPointerTy() && "Unexpected type.");
4495 
4496     if (Cost->isScalarAfterVectorization(P, State.VF)) {
4497       // This is the normalized GEP that starts counting at zero.
4498       Value *PtrInd =
4499           Builder.CreateSExtOrTrunc(CanonicalIV, II.getStep()->getType());
4500       // Determine the number of scalars we need to generate for each unroll
4501       // iteration. If the instruction is uniform, we only need to generate the
4502       // first lane. Otherwise, we generate all VF values.
4503       bool IsUniform = vputils::onlyFirstLaneUsed(PhiR);
4504       assert((IsUniform || !State.VF.isScalable()) &&
4505              "Cannot scalarize a scalable VF");
4506       unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
4507 
4508       for (unsigned Part = 0; Part < UF; ++Part) {
4509         Value *PartStart =
4510             createStepForVF(Builder, PtrInd->getType(), VF, Part);
4511 
4512         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4513           Value *Idx = Builder.CreateAdd(
4514               PartStart, ConstantInt::get(PtrInd->getType(), Lane));
4515           Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4516           Value *SclrGep = emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(),
4517                                                 DL, II, State.CFG.PrevBB);
4518           SclrGep->setName("next.gep");
4519           State.set(PhiR, SclrGep, VPIteration(Part, Lane));
4520         }
4521       }
4522       return;
4523     }
4524     assert(isa<SCEVConstant>(II.getStep()) &&
4525            "Induction step not a SCEV constant!");
4526     Type *PhiType = II.getStep()->getType();
4527 
4528     // Build a pointer phi
4529     Value *ScalarStartValue = PhiR->getStartValue()->getLiveInIRValue();
4530     Type *ScStValueType = ScalarStartValue->getType();
4531     PHINode *NewPointerPhi =
4532         PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
4533     NewPointerPhi->addIncoming(ScalarStartValue, LoopVectorPreHeader);
4534 
4535     // A pointer induction, performed by using a gep
4536     BasicBlock *LoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
4537     Instruction *InductionLoc = LoopLatch->getTerminator();
4538     const SCEV *ScalarStep = II.getStep();
4539     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
4540     Value *ScalarStepValue =
4541         Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
4542     Value *RuntimeVF = getRuntimeVF(Builder, PhiType, VF);
4543     Value *NumUnrolledElems =
4544         Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
4545     Value *InductionGEP = GetElementPtrInst::Create(
4546         II.getElementType(), NewPointerPhi,
4547         Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
4548         InductionLoc);
4549     NewPointerPhi->addIncoming(InductionGEP, LoopLatch);
4550 
4551     // Create UF many actual address geps that use the pointer
4552     // phi as base and a vectorized version of the step value
4553     // (<step*0, ..., step*N>) as offset.
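    // For example (shorthand IR, illustrative, VF = 4, i8 element type, unit
    // step, Part = 0):
    //   %gep.part0 = getelementptr i8, i8* %pointer.phi,
    //                <4 x i64> <i64 0, i64 1, i64 2, i64 3>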
4554     for (unsigned Part = 0; Part < State.UF; ++Part) {
4555       Type *VecPhiType = VectorType::get(PhiType, State.VF);
4556       Value *StartOffsetScalar =
4557           Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
4558       Value *StartOffset =
4559           Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
4560       // Create a vector of consecutive numbers from zero to VF.
4561       StartOffset =
4562           Builder.CreateAdd(StartOffset, Builder.CreateStepVector(VecPhiType));
4563 
4564       Value *GEP = Builder.CreateGEP(
4565           II.getElementType(), NewPointerPhi,
4566           Builder.CreateMul(
4567               StartOffset, Builder.CreateVectorSplat(State.VF, ScalarStepValue),
4568               "vector.gep"));
4569       State.set(PhiR, GEP, Part);
4570     }
4571   }
4572   }
4573 }
4574 
4575 /// A helper function for checking whether an integer division-related
4576 /// instruction may divide by zero (in which case it must be predicated if
4577 /// executed conditionally in the scalar code).
4578 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4579 /// Non-zero divisors that are non compile-time constants will not be
4580 /// converted into multiplication, so we will still end up scalarizing
4581 /// the division, but can do so w/o predication.
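/// For example (illustrative): in
///   if (c[i]) r[i] = a[i] / b[i];
/// the divisor b[i] is not a compile-time constant, so the scalarized
/// division must stay guarded by its predicate to avoid a spurious divide by
/// zero.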
4582 static bool mayDivideByZero(Instruction &I) {
4583   assert((I.getOpcode() == Instruction::UDiv ||
4584           I.getOpcode() == Instruction::SDiv ||
4585           I.getOpcode() == Instruction::URem ||
4586           I.getOpcode() == Instruction::SRem) &&
4587          "Unexpected instruction");
4588   Value *Divisor = I.getOperand(1);
4589   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4590   return !CInt || CInt->isZero();
4591 }
4592 
4593 void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
4594                                                VPUser &ArgOperands,
4595                                                VPTransformState &State) {
4596   assert(!isa<DbgInfoIntrinsic>(I) &&
4597          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4598   setDebugLocFromInst(&I);
4599 
4600   Module *M = I.getParent()->getParent()->getParent();
4601   auto *CI = cast<CallInst>(&I);
4602 
4603   SmallVector<Type *, 4> Tys;
4604   for (Value *ArgOperand : CI->args())
4605     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4606 
4607   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4608 
4609   // The flag shows whether we use an Intrinsic or a usual Call for the
4610   // vectorized version of the instruction.
4611   // Is it beneficial to perform intrinsic call compared to lib call?
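  // For example (illustrative): a call to @llvm.sqrt.f32 may be widened to
  // @llvm.sqrt.v4f32, or mapped to a vector library routine registered in the
  // VFDatabase, whichever the cost model considers cheaper.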
4612   bool NeedToScalarize = false;
4613   InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4614   InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
4615   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4616   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4617          "Instruction should be scalarized elsewhere.");
4618   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4619          "Either the intrinsic cost or vector call cost must be valid");
4620 
4621   for (unsigned Part = 0; Part < UF; ++Part) {
4622     SmallVector<Type *, 2> TysForDecl = {CI->getType()};
4623     SmallVector<Value *, 4> Args;
4624     for (auto &I : enumerate(ArgOperands.operands())) {
4625       // Some intrinsics have a scalar argument - don't replace it with a
4626       // vector.
4627       Value *Arg;
4628       if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
4629         Arg = State.get(I.value(), Part);
4630       else {
4631         Arg = State.get(I.value(), VPIteration(0, 0));
4632         if (hasVectorInstrinsicOverloadedScalarOpd(ID, I.index()))
4633           TysForDecl.push_back(Arg->getType());
4634       }
4635       Args.push_back(Arg);
4636     }
4637 
4638     Function *VectorF;
4639     if (UseVectorIntrinsic) {
4640       // Use vector version of the intrinsic.
4641       if (VF.isVector())
4642         TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4643       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4644       assert(VectorF && "Can't retrieve vector intrinsic.");
4645     } else {
4646       // Use vector version of the function call.
4647       const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
4648 #ifndef NDEBUG
4649       assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
4650              "Can't create vector function.");
4651 #endif
4652       VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
4653     }
4654     SmallVector<OperandBundleDef, 1> OpBundles;
4655     CI->getOperandBundlesAsDefs(OpBundles);
4656     CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4657
4658     if (isa<FPMathOperator>(V))
4659       V->copyFastMathFlags(CI);
4660
4661     State.set(Def, V, Part);
4662     addMetadata(V, &I);
4663   }
4664 }
4665 
4666 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4667   // We should not collect Scalars more than once per VF. Right now, this
4668   // function is called from collectUniformsAndScalars(), which already does
4669   // this check. Collecting Scalars for VF=1 does not make any sense.
4670   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4671          "This function should not be visited twice for the same VF");
4672 
4673   SmallSetVector<Instruction *, 8> Worklist;
4674 
4675   // These sets are used to seed the analysis with pointers used by memory
4676   // accesses that will remain scalar.
4677   SmallSetVector<Instruction *, 8> ScalarPtrs;
4678   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4679   auto *Latch = TheLoop->getLoopLatch();
4680 
4681   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4682   // The pointer operands of loads and stores will be scalar as long as the
4683   // memory access is not a gather or scatter operation. The value operand of a
4684   // store will remain scalar if the store is scalarized.
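  // For example (shorthand IR, illustrative): in
  //   store i32 %v, i32* %gep
  // %gep remains scalar unless the store becomes a scatter, while %v remains
  // scalar only if the store itself is scalarized.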
4685   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4686     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4687     assert(WideningDecision != CM_Unknown &&
4688            "Widening decision should be ready at this moment");
4689     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4690       if (Ptr == Store->getValueOperand())
4691         return WideningDecision == CM_Scalarize;
4692     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4693            "Ptr is neither a value or pointer operand");
4694     return WideningDecision != CM_GatherScatter;
4695   };
4696 
4697   // A helper that returns true if the given value is a bitcast or
4698   // getelementptr instruction contained in the loop.
4699   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4700     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4701             isa<GetElementPtrInst>(V)) &&
4702            !TheLoop->isLoopInvariant(V);
4703   };
4704 
4705   // A helper that evaluates a memory access's use of a pointer. If the use will
4706   // be a scalar use and the pointer is only used by memory accesses, we place
4707   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4708   // PossibleNonScalarPtrs.
4709   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4710     // We only care about bitcast and getelementptr instructions contained in
4711     // the loop.
4712     if (!isLoopVaryingBitCastOrGEP(Ptr))
4713       return;
4714 
4715     // If the pointer has already been identified as scalar (e.g., if it was
4716     // also identified as uniform), there's nothing to do.
4717     auto *I = cast<Instruction>(Ptr);
4718     if (Worklist.count(I))
4719       return;
4720 
4721     // If the use of the pointer will be a scalar use, and all users of the
4722     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4723     // place the pointer in PossibleNonScalarPtrs.
4724     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4725           return isa<LoadInst>(U) || isa<StoreInst>(U);
4726         }))
4727       ScalarPtrs.insert(I);
4728     else
4729       PossibleNonScalarPtrs.insert(I);
4730   };
4731 
  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast and
  // getelementptr instructions used by memory accesses requiring a scalar
  // use, and (3) instructions forced to be scalar for this VF.
4736   //
4737   // (1) Add to the worklist all instructions that have been identified as
4738   // uniform-after-vectorization.
4739   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4740 
4741   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4742   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4744   // scatter operation. The value operand of a store will remain scalar if the
4745   // store is scalarized.
4746   for (auto *BB : TheLoop->blocks())
4747     for (auto &I : *BB) {
4748       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4749         evaluatePtrUse(Load, Load->getPointerOperand());
4750       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4751         evaluatePtrUse(Store, Store->getPointerOperand());
4752         evaluatePtrUse(Store, Store->getValueOperand());
4753       }
4754     }
4755   for (auto *I : ScalarPtrs)
4756     if (!PossibleNonScalarPtrs.count(I)) {
4757       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4758       Worklist.insert(I);
4759     }
4760 
4761   // Insert the forced scalars.
4762   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4763   // induction variable when the PHI user is scalarized.
4764   auto ForcedScalar = ForcedScalars.find(VF);
4765   if (ForcedScalar != ForcedScalars.end())
4766     for (auto *I : ForcedScalar->second)
4767       Worklist.insert(I);
4768 
4769   // Expand the worklist by looking through any bitcasts and getelementptr
4770   // instructions we've already identified as scalar. This is similar to the
4771   // expansion step in collectLoopUniforms(); however, here we're only
4772   // expanding to include additional bitcasts and getelementptr instructions.
4773   unsigned Idx = 0;
4774   while (Idx != Worklist.size()) {
4775     Instruction *Dst = Worklist[Idx++];
4776     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4777       continue;
4778     auto *Src = cast<Instruction>(Dst->getOperand(0));
4779     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4780           auto *J = cast<Instruction>(U);
4781           return !TheLoop->contains(J) || Worklist.count(J) ||
4782                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4783                   isScalarUse(J, Src));
4784         })) {
4785       Worklist.insert(Src);
4786       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4787     }
4788   }
4789 
4790   // An induction variable will remain scalar if all users of the induction
4791   // variable and induction variable update remain scalar.
4792   for (auto &Induction : Legal->getInductionVars()) {
4793     auto *Ind = Induction.first;
4794     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4795 
4796     // If tail-folding is applied, the primary induction variable will be used
4797     // to feed a vector compare.
4798     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4799       continue;
4800 
4801     // Returns true if \p Indvar is a pointer induction that is used directly by
4802     // load/store instruction \p I.
4803     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4804                                               Instruction *I) {
4805       return Induction.second.getKind() ==
4806                  InductionDescriptor::IK_PtrInduction &&
4807              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4808              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4809     };
4810 
4811     // Determine if all users of the induction variable are scalar after
4812     // vectorization.
4813     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4814       auto *I = cast<Instruction>(U);
4815       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4816              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4817     });
4818     if (!ScalarInd)
4819       continue;
4820 
4821     // Determine if all users of the induction variable update instruction are
4822     // scalar after vectorization.
4823     auto ScalarIndUpdate =
4824         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4825           auto *I = cast<Instruction>(U);
4826           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4827                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4828         });
4829     if (!ScalarIndUpdate)
4830       continue;
4831 
4832     // The induction variable and its update instruction will remain scalar.
4833     Worklist.insert(Ind);
4834     Worklist.insert(IndUpdate);
4835     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4836     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4837                       << "\n");
4838   }
4839 
4840   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4841 }
4842 
4843 bool LoopVectorizationCostModel::isScalarWithPredication(
4844     Instruction *I, ElementCount VF) const {
4845   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4846     return false;
  switch (I->getOpcode()) {
4848   default:
4849     break;
4850   case Instruction::Load:
4851   case Instruction::Store: {
4852     if (!Legal->isMaskRequired(I))
4853       return false;
4854     auto *Ptr = getLoadStorePointerOperand(I);
4855     auto *Ty = getLoadStoreType(I);
4856     Type *VTy = Ty;
4857     if (VF.isVector())
4858       VTy = VectorType::get(Ty, VF);
4859     const Align Alignment = getLoadStoreAlignment(I);
4860     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4861                                 TTI.isLegalMaskedGather(VTy, Alignment))
4862                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4863                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4864   }
4865   case Instruction::UDiv:
4866   case Instruction::SDiv:
4867   case Instruction::SRem:
4868   case Instruction::URem:
4869     return mayDivideByZero(*I);
4870   }
4871   return false;
4872 }
4873 
4874 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4875     Instruction *I, ElementCount VF) {
4876   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4877   assert(getWideningDecision(I, VF) == CM_Unknown &&
4878          "Decision should not be set yet.");
4879   auto *Group = getInterleavedAccessGroup(I);
4880   assert(Group && "Must have a group.");
4881 
  // If the instruction's allocated size doesn't equal its type size, it
4883   // requires padding and will be scalarized.
4884   auto &DL = I->getModule()->getDataLayout();
4885   auto *ScalarTy = getLoadStoreType(I);
4886   if (hasIrregularType(ScalarTy, DL))
4887     return false;
4888 
4889   // Check if masking is required.
4890   // A Group may need masking for one of two reasons: it resides in a block that
4891   // needs predication, or it was decided to use masking to deal with gaps
4892   // (either a gap at the end of a load-access that may result in a speculative
4893   // load, or any gaps in a store-access).
4894   bool PredicatedAccessRequiresMasking =
4895       blockNeedsPredicationForAnyReason(I->getParent()) &&
4896       Legal->isMaskRequired(I);
4897   bool LoadAccessWithGapsRequiresEpilogMasking =
4898       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4899       !isScalarEpilogueAllowed();
4900   bool StoreAccessWithGapsRequiresMasking =
4901       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4902   if (!PredicatedAccessRequiresMasking &&
4903       !LoadAccessWithGapsRequiresEpilogMasking &&
4904       !StoreAccessWithGapsRequiresMasking)
4905     return true;
4906 
4907   // If masked interleaving is required, we expect that the user/target had
4908   // enabled it, because otherwise it either wouldn't have been created or
4909   // it should have been invalidated by the CostModel.
4910   assert(useMaskedInterleavedAccesses(TTI) &&
4911          "Masked interleave-groups for predicated accesses are not enabled.");
4912 
4913   if (Group->isReverse())
4914     return false;
4915 
4916   auto *Ty = getLoadStoreType(I);
4917   const Align Alignment = getLoadStoreAlignment(I);
4918   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4919                           : TTI.isLegalMaskedStore(Ty, Alignment);
4920 }
4921 
4922 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4923     Instruction *I, ElementCount VF) {
4924   // Get and ensure we have a valid memory instruction.
4925   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4926 
4927   auto *Ptr = getLoadStorePointerOperand(I);
4928   auto *ScalarTy = getLoadStoreType(I);
4929 
4930   // In order to be widened, the pointer should be consecutive, first of all.
4931   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4932     return false;
4933 
4934   // If the instruction is a store located in a predicated block, it will be
4935   // scalarized.
4936   if (isScalarWithPredication(I, VF))
4937     return false;
4938 
  // If the instruction's allocated size doesn't equal its type size, it
4940   // requires padding and will be scalarized.
4941   auto &DL = I->getModule()->getDataLayout();
4942   if (hasIrregularType(ScalarTy, DL))
4943     return false;
4944 
4945   return true;
4946 }
4947 
4948 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4949   // We should not collect Uniforms more than once per VF. Right now,
4950   // this function is called from collectUniformsAndScalars(), which
4951   // already does this check. Collecting Uniforms for VF=1 does not make any
4952   // sense.
4953 
4954   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4955          "This function should not be visited twice for the same VF");
4956 
  // Initialize the entry for this VF. Even if no uniform values are found,
  // the entry ensures we will not analyze this VF again: Uniforms.count(VF)
  // will return 1.
4959   Uniforms[VF].clear();
4960 
4961   // We now know that the loop is vectorizable!
4962   // Collect instructions inside the loop that will remain uniform after
4963   // vectorization.
4964 
4965   // Global values, params and instructions outside of current loop are out of
4966   // scope.
4967   auto isOutOfScope = [&](Value *V) -> bool {
4968     Instruction *I = dyn_cast<Instruction>(V);
4969     return (!I || !TheLoop->contains(I));
4970   };
4971 
4972   // Worklist containing uniform instructions demanding lane 0.
4973   SetVector<Instruction *> Worklist;
4974   BasicBlock *Latch = TheLoop->getLoopLatch();
4975 
4976   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4977   // that are scalar with predication must not be considered uniform after
4978   // vectorization, because that would create an erroneous replicating region
4979   // where only a single instance out of VF should be formed.
  // TODO: optimize such rare cases if found important, see PR40816.
4981   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4982     if (isOutOfScope(I)) {
4983       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4984                         << *I << "\n");
4985       return;
4986     }
4987     if (isScalarWithPredication(I, VF)) {
4988       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4989                         << *I << "\n");
4990       return;
4991     }
4992     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4993     Worklist.insert(I);
4994   };
4995 
4996   // Start with the conditional branch. If the branch condition is an
4997   // instruction contained in the loop that is only used by the branch, it is
4998   // uniform.
4999   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
5000   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
5001     addToWorklistIfAllowed(Cmp);
5002 
5003   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
5004     InstWidening WideningDecision = getWideningDecision(I, VF);
5005     assert(WideningDecision != CM_Unknown &&
5006            "Widening decision should be ready at this moment");
5007 
5008     // A uniform memory op is itself uniform.  We exclude uniform stores
5009     // here as they demand the last lane, not the first one.
5010     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
5011       assert(WideningDecision == CM_Scalarize);
5012       return true;
5013     }
5014 
5015     return (WideningDecision == CM_Widen ||
5016             WideningDecision == CM_Widen_Reverse ||
5017             WideningDecision == CM_Interleave);
5018   };
5019 
  // Returns true if Ptr is the pointer operand of a memory access instruction
5022   // I, and I is known to not require scalarization.
5023   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
5024     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
5025   };
5026 
5027   // Holds a list of values which are known to have at least one uniform use.
5028   // Note that there may be other uses which aren't uniform.  A "uniform use"
5029   // here is something which only demands lane 0 of the unrolled iterations;
5030   // it does not imply that all lanes produce the same value (e.g. this is not
5031   // the usual meaning of uniform)
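  // For instance, the address operand of a load widened as a consecutive
  // (CM_Widen) access is such a use: only lane 0 of the address is needed to
  // form the wide load, even though the address itself varies per lane.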
5032   SetVector<Value *> HasUniformUse;
5033 
5034   // Scan the loop for instructions which are either a) known to have only
5035   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
5036   for (auto *BB : TheLoop->blocks())
5037     for (auto &I : *BB) {
5038       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
5039         switch (II->getIntrinsicID()) {
5040         case Intrinsic::sideeffect:
5041         case Intrinsic::experimental_noalias_scope_decl:
5042         case Intrinsic::assume:
5043         case Intrinsic::lifetime_start:
5044         case Intrinsic::lifetime_end:
5045           if (TheLoop->hasLoopInvariantOperands(&I))
5046             addToWorklistIfAllowed(&I);
5047           break;
5048         default:
5049           break;
5050         }
5051       }
5052 
5053       // ExtractValue instructions must be uniform, because the operands are
5054       // known to be loop-invariant.
5055       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
5056         assert(isOutOfScope(EVI->getAggregateOperand()) &&
5057                "Expected aggregate value to be loop invariant");
5058         addToWorklistIfAllowed(EVI);
5059         continue;
5060       }
5061 
5062       // If there's no pointer operand, there's nothing to do.
5063       auto *Ptr = getLoadStorePointerOperand(&I);
5064       if (!Ptr)
5065         continue;
5066 
5067       // A uniform memory op is itself uniform.  We exclude uniform stores
5068       // here as they demand the last lane, not the first one.
5069       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
5070         addToWorklistIfAllowed(&I);
5071 
5072       if (isUniformDecision(&I, VF)) {
5073         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
5074         HasUniformUse.insert(Ptr);
5075       }
5076     }
5077 
5078   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
5079   // demanding) users.  Since loops are assumed to be in LCSSA form, this
5080   // disallows uses outside the loop as well.
5081   for (auto *V : HasUniformUse) {
5082     if (isOutOfScope(V))
5083       continue;
5084     auto *I = cast<Instruction>(V);
5085     auto UsersAreMemAccesses =
5086       llvm::all_of(I->users(), [&](User *U) -> bool {
5087         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
5088       });
5089     if (UsersAreMemAccesses)
5090       addToWorklistIfAllowed(I);
5091   }
5092 
  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should already be inside Worklist. This ensures
  // a uniform instruction will only be used by uniform instructions.
5096   unsigned idx = 0;
5097   while (idx != Worklist.size()) {
5098     Instruction *I = Worklist[idx++];
5099 
5100     for (auto OV : I->operand_values()) {
5101       // isOutOfScope operands cannot be uniform instructions.
5102       if (isOutOfScope(OV))
5103         continue;
      // First-order recurrence phis should typically be considered
5105       // non-uniform.
5106       auto *OP = dyn_cast<PHINode>(OV);
5107       if (OP && Legal->isFirstOrderRecurrence(OP))
5108         continue;
5109       // If all the users of the operand are uniform, then add the
5110       // operand into the uniform worklist.
5111       auto *OI = cast<Instruction>(OV);
5112       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
5113             auto *J = cast<Instruction>(U);
5114             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
5115           }))
5116         addToWorklistIfAllowed(OI);
5117     }
5118   }
5119 
5120   // For an instruction to be added into Worklist above, all its users inside
5121   // the loop should also be in Worklist. However, this condition cannot be
5122   // true for phi nodes that form a cyclic dependence. We must process phi
5123   // nodes separately. An induction variable will remain uniform if all users
5124   // of the induction variable and induction variable update remain uniform.
5125   // The code below handles both pointer and non-pointer induction variables.
5126   for (auto &Induction : Legal->getInductionVars()) {
5127     auto *Ind = Induction.first;
5128     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
5129 
5130     // Determine if all users of the induction variable are uniform after
5131     // vectorization.
5132     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
5133       auto *I = cast<Instruction>(U);
5134       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
5135              isVectorizedMemAccessUse(I, Ind);
5136     });
5137     if (!UniformInd)
5138       continue;
5139 
5140     // Determine if all users of the induction variable update instruction are
5141     // uniform after vectorization.
5142     auto UniformIndUpdate =
5143         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
5144           auto *I = cast<Instruction>(U);
5145           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
5146                  isVectorizedMemAccessUse(I, IndUpdate);
5147         });
5148     if (!UniformIndUpdate)
5149       continue;
5150 
5151     // The induction variable and its update instruction will remain uniform.
5152     addToWorklistIfAllowed(Ind);
5153     addToWorklistIfAllowed(IndUpdate);
5154   }
5155 
5156   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
5157 }
5158 
5159 bool LoopVectorizationCostModel::runtimeChecksRequired() {
5160   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
5161 
5162   if (Legal->getRuntimePointerChecking()->Need) {
5163     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
5164         "runtime pointer checks needed. Enable vectorization of this "
5165         "loop with '#pragma clang loop vectorize(enable)' when "
5166         "compiling with -Os/-Oz",
5167         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5168     return true;
5169   }
5170 
5171   if (!PSE.getUnionPredicate().getPredicates().empty()) {
5172     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
5173         "runtime SCEV checks needed. Enable vectorization of this "
5174         "loop with '#pragma clang loop vectorize(enable)' when "
5175         "compiling with -Os/-Oz",
5176         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5177     return true;
5178   }
5179 
5180   // FIXME: Avoid specializing for stride==1 instead of bailing out.
5181   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
5182     reportVectorizationFailure("Runtime stride check for small trip count",
5183         "runtime stride == 1 checks needed. Enable vectorization of "
5184         "this loop without such check by compiling with -Os/-Oz",
5185         "CantVersionLoopWithOptForSize", ORE, TheLoop);
5186     return true;
5187   }
5188 
5189   return false;
5190 }
5191 
5192 ElementCount
5193 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
5194   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
5195     return ElementCount::getScalable(0);
5196 
5197   if (Hints->isScalableVectorizationDisabled()) {
5198     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
5199                             "ScalableVectorizationDisabled", ORE, TheLoop);
5200     return ElementCount::getScalable(0);
5201   }
5202 
5203   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
5204 
5205   auto MaxScalableVF = ElementCount::getScalable(
5206       std::numeric_limits<ElementCount::ScalarTy>::max());
5207 
5208   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
5209   // FIXME: While for scalable vectors this is currently sufficient, this should
5210   // be replaced by a more detailed mechanism that filters out specific VFs,
5211   // instead of invalidating vectorization for a whole set of VFs based on the
5212   // MaxVF.
5213 
5214   // Disable scalable vectorization if the loop contains unsupported reductions.
5215   if (!canVectorizeReductions(MaxScalableVF)) {
5216     reportVectorizationInfo(
5217         "Scalable vectorization not supported for the reduction "
5218         "operations found in this loop.",
5219         "ScalableVFUnfeasible", ORE, TheLoop);
5220     return ElementCount::getScalable(0);
5221   }
5222 
5223   // Disable scalable vectorization if the loop contains any instructions
5224   // with element types not supported for scalable vectors.
5225   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
5226         return !Ty->isVoidTy() &&
5227                !this->TTI.isElementTypeLegalForScalableVector(Ty);
5228       })) {
5229     reportVectorizationInfo("Scalable vectorization is not supported "
5230                             "for all element types found in this loop.",
5231                             "ScalableVFUnfeasible", ORE, TheLoop);
5232     return ElementCount::getScalable(0);
5233   }
5234 
5235   if (Legal->isSafeForAnyVectorWidth())
5236     return MaxScalableVF;
5237 
5238   // Limit MaxScalableVF by the maximum safe dependence distance.
5239   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
5240   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
5241     MaxVScale =
5242         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
5243   MaxScalableVF = ElementCount::getScalable(
5244       MaxVScale ? (MaxSafeElements / MaxVScale.getValue()) : 0);
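  // Illustrative arithmetic (hypothetical values): with MaxSafeElements = 32
  // and a maximum vscale of 16, the largest safe scalable VF computed above
  // is vscale x (32 / 16) = vscale x 2.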
5245   if (!MaxScalableVF)
5246     reportVectorizationInfo(
5247         "Max legal vector width too small, scalable vectorization "
5248         "unfeasible.",
5249         "ScalableVFUnfeasible", ORE, TheLoop);
5250 
5251   return MaxScalableVF;
5252 }
5253 
5254 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
5255     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
5256   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5257   unsigned SmallestType, WidestType;
5258   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5259 
5260   // Get the maximum safe dependence distance in bits computed by LAA.
5261   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5262   // the memory accesses that is most restrictive (involved in the smallest
5263   // dependence distance).
5264   unsigned MaxSafeElements =
5265       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
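  // Illustrative arithmetic (hypothetical values): a max safe width of 256
  // bits with a widest type of 32 bits gives
  // MaxSafeElements = PowerOf2Floor(256 / 32) = 8.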
5266 
5267   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
5268   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
5269 
5270   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
5271                     << ".\n");
5272   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
5273                     << ".\n");
5274 
5275   // First analyze the UserVF, fall back if the UserVF should be ignored.
5276   if (UserVF) {
5277     auto MaxSafeUserVF =
5278         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
5279 
5280     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
5281       // If `VF=vscale x N` is safe, then so is `VF=N`
5282       if (UserVF.isScalable())
5283         return FixedScalableVFPair(
5284             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
5285       else
5286         return UserVF;
5287     }
5288 
5289     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
5290 
5291     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
5292     // is better to ignore the hint and let the compiler choose a suitable VF.
5293     if (!UserVF.isScalable()) {
5294       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5295                         << " is unsafe, clamping to max safe VF="
5296                         << MaxSafeFixedVF << ".\n");
5297       ORE->emit([&]() {
5298         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5299                                           TheLoop->getStartLoc(),
5300                                           TheLoop->getHeader())
5301                << "User-specified vectorization factor "
5302                << ore::NV("UserVectorizationFactor", UserVF)
5303                << " is unsafe, clamping to maximum safe vectorization factor "
5304                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
5305       });
5306       return MaxSafeFixedVF;
5307     }
5308 
5309     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
5310       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5311                         << " is ignored because scalable vectors are not "
5312                            "available.\n");
5313       ORE->emit([&]() {
5314         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5315                                           TheLoop->getStartLoc(),
5316                                           TheLoop->getHeader())
5317                << "User-specified vectorization factor "
5318                << ore::NV("UserVectorizationFactor", UserVF)
5319                << " is ignored because the target does not support scalable "
5320                   "vectors. The compiler will pick a more suitable value.";
5321       });
5322     } else {
5323       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
5324                         << " is unsafe. Ignoring scalable UserVF.\n");
5325       ORE->emit([&]() {
5326         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
5327                                           TheLoop->getStartLoc(),
5328                                           TheLoop->getHeader())
5329                << "User-specified vectorization factor "
5330                << ore::NV("UserVectorizationFactor", UserVF)
5331                << " is unsafe. Ignoring the hint to let the compiler pick a "
5332                   "more suitable value.";
5333       });
5334     }
5335   }
5336 
5337   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5338                     << " / " << WidestType << " bits.\n");
5339 
5340   FixedScalableVFPair Result(ElementCount::getFixed(1),
5341                              ElementCount::getScalable(0));
5342   if (auto MaxVF =
5343           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5344                                   MaxSafeFixedVF, FoldTailByMasking))
5345     Result.FixedVF = MaxVF;
5346 
5347   if (auto MaxVF =
5348           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
5349                                   MaxSafeScalableVF, FoldTailByMasking))
5350     if (MaxVF.isScalable()) {
5351       Result.ScalableVF = MaxVF;
5352       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
5353                         << "\n");
5354     }
5355 
5356   return Result;
5357 }
5358 
5359 FixedScalableVFPair
5360 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
5361   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this since it's still likely to be
    // dynamically uniform if the target can skip.
5364     reportVectorizationFailure(
5365         "Not inserting runtime ptr check for divergent target",
5366         "runtime pointer checks needed. Not enabled for divergent target",
5367         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
5368     return FixedScalableVFPair::getNone();
5369   }
5370 
5371   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
5372   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
5373   if (TC == 1) {
5374     reportVectorizationFailure("Single iteration (non) loop",
5375         "loop trip count is one, irrelevant for vectorization",
5376         "SingleIterationLoop", ORE, TheLoop);
5377     return FixedScalableVFPair::getNone();
5378   }
5379 
5380   switch (ScalarEpilogueStatus) {
5381   case CM_ScalarEpilogueAllowed:
5382     return computeFeasibleMaxVF(TC, UserVF, false);
5383   case CM_ScalarEpilogueNotAllowedUsePredicate:
5384     LLVM_FALLTHROUGH;
5385   case CM_ScalarEpilogueNotNeededUsePredicate:
5386     LLVM_DEBUG(
5387         dbgs() << "LV: vector predicate hint/switch found.\n"
5388                << "LV: Not allowing scalar epilogue, creating predicated "
5389                << "vector loop.\n");
5390     break;
5391   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5392     // fallthrough as a special case of OptForSize
5393   case CM_ScalarEpilogueNotAllowedOptSize:
5394     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5395       LLVM_DEBUG(
5396           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5397     else
5398       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5399                         << "count.\n");
5400 
5401     // Bail if runtime checks are required, which are not good when optimising
5402     // for size.
5403     if (runtimeChecksRequired())
5404       return FixedScalableVFPair::getNone();
5405 
5406     break;
5407   }
5408 
  // The only loops we can vectorize without a scalar epilogue are loops with
  // a bottom-test and a single exiting block. We'd have to handle the fact
5411   // that not every instruction executes on the last iteration.  This will
5412   // require a lane mask which varies through the vector loop body.  (TODO)
5413   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5414     // If there was a tail-folding hint/switch, but we can't fold the tail by
5415     // masking, fallback to a vectorization with a scalar epilogue.
5416     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5417       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5418                            "scalar epilogue instead.\n");
5419       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5420       return computeFeasibleMaxVF(TC, UserVF, false);
5421     }
5422     return FixedScalableVFPair::getNone();
5423   }
5424 
5425   // Now try the tail folding
5426 
5427   // Invalidate interleave groups that require an epilogue if we can't mask
5428   // the interleave-group.
5429   if (!useMaskedInterleavedAccesses(TTI)) {
5430     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5431            "No decisions should have been taken at this point");
5432     // Note: There is no need to invalidate any cost modeling decisions here, as
    // none were taken so far.
5434     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5435   }
5436 
5437   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5438   // Avoid tail folding if the trip count is known to be a multiple of any VF
5439   // we chose.
5440   // FIXME: The condition below pessimises the case for fixed-width vectors,
5441   // when scalable VFs are also candidates for vectorization.
5442   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5443     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5444     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5445            "MaxFixedVF must be a power of 2");
5446     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5447                                    : MaxFixedVF.getFixedValue();
5448     ScalarEvolution *SE = PSE.getSE();
5449     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5450     const SCEV *ExitCount = SE->getAddExpr(
5451         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5452     const SCEV *Rem = SE->getURemExpr(
5453         SE->applyLoopGuards(ExitCount, TheLoop),
5454         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5455     if (Rem->isZero()) {
5456       // Accept MaxFixedVF if we do not have a tail.
5457       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5458       return MaxFactors;
5459     }
5460   }
5461 
5462   // For scalable vectors don't use tail folding for low trip counts or
5463   // optimizing for code size. We only permit this if the user has explicitly
5464   // requested it.
5465   if (ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate &&
5466       ScalarEpilogueStatus != CM_ScalarEpilogueNotAllowedUsePredicate &&
5467       MaxFactors.ScalableVF.isVector())
5468     MaxFactors.ScalableVF = ElementCount::getScalable(0);
5469 
5470   // If we don't know the precise trip count, or if the trip count that we
5471   // found modulo the vectorization factor is not zero, try to fold the tail
5472   // by masking.
5473   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5474   if (Legal->prepareToFoldTailByMasking()) {
5475     FoldTailByMasking = true;
5476     return MaxFactors;
5477   }
5478 
5479   // If there was a tail-folding hint/switch, but we can't fold the tail by
5480   // masking, fallback to a vectorization with a scalar epilogue.
5481   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5482     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5483                          "scalar epilogue instead.\n");
5484     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5485     return MaxFactors;
5486   }
5487 
5488   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5489     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5490     return FixedScalableVFPair::getNone();
5491   }
5492 
5493   if (TC == 0) {
5494     reportVectorizationFailure(
5495         "Unable to calculate the loop count due to complex control flow",
5496         "unable to calculate the loop count due to complex control flow",
5497         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5498     return FixedScalableVFPair::getNone();
5499   }
5500 
5501   reportVectorizationFailure(
5502       "Cannot optimize for size and vectorize at the same time.",
5503       "cannot optimize for size and vectorize at the same time. "
5504       "Enable vectorization of this loop with '#pragma clang loop "
5505       "vectorize(enable)' when compiling with -Os/-Oz",
5506       "NoTailLoopWithOptForSize", ORE, TheLoop);
5507   return FixedScalableVFPair::getNone();
5508 }
5509 
5510 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5511     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5512     const ElementCount &MaxSafeVF, bool FoldTailByMasking) {
5513   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5514   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5515       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5516                            : TargetTransformInfo::RGK_FixedWidthVector);
5517 
5518   // Convenience function to return the minimum of two ElementCounts.
5519   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5520     assert((LHS.isScalable() == RHS.isScalable()) &&
5521            "Scalable flags must match");
5522     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5523   };
5524 
5525   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
5527   auto MaxVectorElementCount = ElementCount::get(
5528       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5529       ComputeScalableMaxVF);
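  // Illustrative arithmetic (hypothetical values): a 128-bit (minimum) vector
  // register with a widest element type of 32 bits yields
  // PowerOf2Floor(128 / 32) = 4 lanes, i.e. a maximum element count of 4
  // (or vscale x 4 when computing a scalable maximum).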
5530   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5531   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5532                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5533 
5534   if (!MaxVectorElementCount) {
5535     LLVM_DEBUG(dbgs() << "LV: The target has no "
5536                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5537                       << " vector registers.\n");
5538     return ElementCount::getFixed(1);
5539   }
5540 
5541   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5542   if (ConstTripCount &&
5543       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5544       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
5545     // If loop trip count (TC) is known at compile time there is no point in
5546     // choosing VF greater than TC (as done in the loop below). Select maximum
5547     // power of two which doesn't exceed TC.
5548     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5549     // when the TC is less than or equal to the known number of lanes.
5550     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5551     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5552                          "exceeding the constant trip count: "
5553                       << ClampedConstTripCount << "\n");
5554     return ElementCount::getFixed(ClampedConstTripCount);
5555   }
5556 
5557   ElementCount MaxVF = MaxVectorElementCount;
5558   if (TTI.shouldMaximizeVectorBandwidth() ||
5559       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5560     auto MaxVectorElementCountMaxBW = ElementCount::get(
5561         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5562         ComputeScalableMaxVF);
5563     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5564 
5565     // Collect all viable vectorization factors larger than the default MaxVF
5566     // (i.e. MaxVectorElementCount).
5567     SmallVector<ElementCount, 8> VFs;
5568     for (ElementCount VS = MaxVectorElementCount * 2;
5569          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5570       VFs.push_back(VS);
5571 
5572     // For each VF calculate its register usage.
5573     auto RUs = calculateRegisterUsage(VFs);
5574 
5575     // Select the largest VF which doesn't require more registers than existing
5576     // ones.
5577     for (int i = RUs.size() - 1; i >= 0; --i) {
5578       bool Selected = true;
5579       for (auto &pair : RUs[i].MaxLocalUsers) {
5580         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5581         if (pair.second > TargetNumRegisters)
5582           Selected = false;
5583       }
5584       if (Selected) {
5585         MaxVF = VFs[i];
5586         break;
5587       }
5588     }
5589     if (ElementCount MinVF =
5590             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5591       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5592         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5593                           << ") with target's minimum: " << MinVF << '\n');
5594         MaxVF = MinVF;
5595       }
5596     }
5597   }
5598   return MaxVF;
5599 }
5600 
5601 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5602   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5603     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5604     auto Min = Attr.getVScaleRangeMin();
5605     auto Max = Attr.getVScaleRangeMax();
5606     if (Max && Min == Max)
5607       return Max;
5608   }
5609 
5610   return TTI.getVScaleForTuning();
5611 }
5612 
5613 bool LoopVectorizationCostModel::isMoreProfitable(
5614     const VectorizationFactor &A, const VectorizationFactor &B) const {
5615   InstructionCost CostA = A.Cost;
5616   InstructionCost CostB = B.Cost;
5617 
5618   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5619 
5620   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5621       MaxTripCount) {
5622     // If we are folding the tail and the trip count is a known (possibly small)
5623     // constant, the trip count will be rounded up to an integer number of
5624     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5625     // which we compare directly. When not folding the tail, the total cost will
5626     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
5627     // approximated with the per-lane cost below instead of using the tripcount
5628     // as here.
5629     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5630     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5631     return RTCostA < RTCostB;
5632   }
5633 
5634   // Improve estimate for the vector width if it is scalable.
5635   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5636   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5637   if (Optional<unsigned> VScale = getVScaleForTuning()) {
5638     if (A.Width.isScalable())
5639       EstimatedWidthA *= VScale.getValue();
5640     if (B.Width.isScalable())
5641       EstimatedWidthB *= VScale.getValue();
5642   }
5643 
5644   // Assume vscale may be larger than 1 (or the value being tuned for),
5645   // so that scalable vectorization is slightly favorable over fixed-width
5646   // vectorization.
5647   if (A.Width.isScalable() && !B.Width.isScalable())
5648     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5649 
5650   // To avoid the need for FP division:
5651   //      (CostA / A.Width) < (CostB / B.Width)
5652   // <=>  (CostA * B.Width) < (CostB * A.Width)
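  // Illustrative arithmetic (hypothetical costs): CostA = 8 at width 4 versus
  // CostB = 12 at width 8 compares 8 * 8 = 64 against 12 * 4 = 48, so A
  // (2.0 per lane) is not deemed more profitable than B (1.5 per lane).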
5653   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5654 }
5655 
5656 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5657     const ElementCountSet &VFCandidates) {
5658   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5659   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5660   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5661   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5662          "Expected Scalar VF to be a candidate");
5663 
5664   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost);
5665   VectorizationFactor ChosenFactor = ScalarCost;
5666 
5667   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5668   if (ForceVectorization && VFCandidates.size() > 1) {
5669     // Ignore scalar width, because the user explicitly wants vectorization.
5670     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5671     // evaluation.
5672     ChosenFactor.Cost = InstructionCost::getMax();
5673   }
5674 
5675   SmallVector<InstructionVFPair> InvalidCosts;
5676   for (const auto &i : VFCandidates) {
5677     // The cost for scalar VF=1 is already calculated, so ignore it.
5678     if (i.isScalar())
5679       continue;
5680 
5681     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5682     VectorizationFactor Candidate(i, C.first);
5683 
5684 #ifndef NDEBUG
5685     unsigned AssumedMinimumVscale = 1;
5686     if (Optional<unsigned> VScale = getVScaleForTuning())
5687       AssumedMinimumVscale = VScale.getValue();
5688     unsigned Width =
5689         Candidate.Width.isScalable()
5690             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5691             : Candidate.Width.getFixedValue();
5692     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5693                       << " costs: " << (Candidate.Cost / Width));
5694     if (i.isScalable())
5695       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5696                         << AssumedMinimumVscale << ")");
5697     LLVM_DEBUG(dbgs() << ".\n");
5698 #endif
5699 
5700     if (!C.second && !ForceVectorization) {
5701       LLVM_DEBUG(
5702           dbgs() << "LV: Not considering vector loop of width " << i
5703                  << " because it will not generate any vector instructions.\n");
5704       continue;
5705     }
5706 
5707     // If profitable add it to ProfitableVF list.
5708     if (isMoreProfitable(Candidate, ScalarCost))
5709       ProfitableVFs.push_back(Candidate);
5710 
5711     if (isMoreProfitable(Candidate, ChosenFactor))
5712       ChosenFactor = Candidate;
5713   }
5714 
5715   // Emit a report of VFs with invalid costs in the loop.
5716   if (!InvalidCosts.empty()) {
5717     // Group the remarks per instruction, keeping the instruction order from
5718     // InvalidCosts.
5719     std::map<Instruction *, unsigned> Numbering;
5720     unsigned I = 0;
5721     for (auto &Pair : InvalidCosts)
5722       if (!Numbering.count(Pair.first))
5723         Numbering[Pair.first] = I++;
5724 
5725     // Sort the list, first on instruction(number) then on VF.
5726     llvm::sort(InvalidCosts,
5727                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5728                  if (Numbering[A.first] != Numbering[B.first])
5729                    return Numbering[A.first] < Numbering[B.first];
5730                  ElementCountComparator ECC;
5731                  return ECC(A.second, B.second);
5732                });
5733 
5734     // For a list of ordered instruction-vf pairs:
5735     //   [(load, vf1), (load, vf2), (store, vf1)]
5736     // Group the instructions together to emit separate remarks for:
5737     //   load  (vf1, vf2)
5738     //   store (vf1)
5739     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5740     auto Subset = ArrayRef<InstructionVFPair>();
5741     do {
5742       if (Subset.empty())
5743         Subset = Tail.take_front(1);
5744 
5745       Instruction *I = Subset.front().first;
5746 
5747       // If the next instruction is different, or if there are no other pairs,
5748       // emit a remark for the collated subset. e.g.
5749       //   [(load, vf1), (load, vf2))]
5750       // to emit:
      //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5752       if (Subset == Tail || Tail[Subset.size()].first != I) {
5753         std::string OutString;
5754         raw_string_ostream OS(OutString);
5755         assert(!Subset.empty() && "Unexpected empty range");
5756         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5757         for (auto &Pair : Subset)
5758           OS << (Pair.second == Subset.front().second ? "" : ", ")
5759              << Pair.second;
5760         OS << "):";
5761         if (auto *CI = dyn_cast<CallInst>(I))
5762           OS << " call to " << CI->getCalledFunction()->getName();
5763         else
5764           OS << " " << I->getOpcodeName();
5765         OS.flush();
5766         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5767         Tail = Tail.drop_front(Subset.size());
5768         Subset = {};
5769       } else
5770         // Grow the subset by one element
5771         Subset = Tail.take_front(Subset.size() + 1);
5772     } while (!Tail.empty());
5773   }
5774 
5775   if (!EnableCondStoresVectorization && NumPredStores) {
5776     reportVectorizationFailure("There are conditional stores.",
5777         "store that is conditionally executed prevents vectorization",
5778         "ConditionalStore", ORE, TheLoop);
5779     ChosenFactor = ScalarCost;
5780   }
5781 
5782   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5783                  ChosenFactor.Cost >= ScalarCost.Cost) dbgs()
5784              << "LV: Vectorization seems to be not beneficial, "
5785              << "but was forced by a user.\n");
5786   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5787   return ChosenFactor;
5788 }
5789 
5790 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5791     const Loop &L, ElementCount VF) const {
5792   // Cross iteration phis such as reductions need special handling and are
5793   // currently unsupported.
5794   if (any_of(L.getHeader()->phis(),
5795              [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
5796     return false;
5797 
5798   // Phis with uses outside of the loop require special handling and are
5799   // currently unsupported.
5800   for (auto &Entry : Legal->getInductionVars()) {
5801     // Look for uses of the value of the induction at the last iteration.
5802     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5803     for (User *U : PostInc->users())
5804       if (!L.contains(cast<Instruction>(U)))
5805         return false;
5806     // Look for uses of penultimate value of the induction.
5807     for (User *U : Entry.first->users())
5808       if (!L.contains(cast<Instruction>(U)))
5809         return false;
5810   }
5811 
5812   // Induction variables that are widened require special handling that is
5813   // currently not supported.
5814   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5815         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5816                  this->isProfitableToScalarize(Entry.first, VF));
5817       }))
5818     return false;
5819 
  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
5823   if (L.getExitingBlock() != L.getLoopLatch())
5824     return false;
5825 
5826   return true;
5827 }
5828 
5829 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5830     const ElementCount VF) const {
5831   // FIXME: We need a much better cost-model to take different parameters such
5832   // as register pressure, code size increase and cost of extra branches into
5833   // account. For now we apply a very crude heuristic and only consider loops
5834   // with vectorization factors larger than a certain value.
5835   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
5837   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5838     return false;
5839   // FIXME: We should consider changing the threshold for scalable
5840   // vectors to take VScaleForTuning into account.
5841   if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5842     return true;
5843   return false;
5844 }
5845 
5846 VectorizationFactor
5847 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5848     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5849   VectorizationFactor Result = VectorizationFactor::Disabled();
5850   if (!EnableEpilogueVectorization) {
5851     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5852     return Result;
5853   }
5854 
5855   if (!isScalarEpilogueAllowed()) {
5856     LLVM_DEBUG(
5857         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5858                   "allowed.\n";);
5859     return Result;
5860   }
5861 
5862   // Not really a cost consideration, but check for unsupported cases here to
5863   // simplify the logic.
5864   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5865     LLVM_DEBUG(
5866         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5867                   "not a supported candidate.\n";);
5868     return Result;
5869   }
5870 
5871   if (EpilogueVectorizationForceVF > 1) {
5872     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
    ElementCount ForcedEC =
        ElementCount::getFixed(EpilogueVectorizationForceVF);
5874     if (LVP.hasPlanWithVF(ForcedEC))
5875       return {ForcedEC, 0};
5876     else {
5877       LLVM_DEBUG(
5878           dbgs()
5879               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5880       return Result;
5881     }
5882   }
5883 
5884   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5885       TheLoop->getHeader()->getParent()->hasMinSize()) {
5886     LLVM_DEBUG(
5887         dbgs()
5888             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5889     return Result;
5890   }
5891 
5892   if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5893     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5894                          "this loop\n");
5895     return Result;
5896   }
5897 
5898   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5899   // the main loop handles 8 lanes per iteration. We could still benefit from
5900   // vectorizing the epilogue loop with VF=4.
5901   ElementCount EstimatedRuntimeVF = MainLoopVF;
5902   if (MainLoopVF.isScalable()) {
5903     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5904     if (Optional<unsigned> VScale = getVScaleForTuning())
5905       EstimatedRuntimeVF *= VScale.getValue();
5906   }
5907 
5908   for (auto &NextVF : ProfitableVFs)
5909     if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5910           ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5911          ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5912         (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5913         LVP.hasPlanWithVF(NextVF.Width))
5914       Result = NextVF;
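  // Illustrative sketch (hypothetical values): with a fixed main loop VF of
  // 8, a previously recorded profitable candidate of VF 4 that also has a
  // VPlan would be chosen as the epilogue factor here.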
5915 
5916   if (Result != VectorizationFactor::Disabled())
5917     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5918                       << Result.Width << "\n";);
5919   return Result;
5920 }
5921 
5922 std::pair<unsigned, unsigned>
5923 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5924   unsigned MinWidth = -1U;
5925   unsigned MaxWidth = 8;
5926   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5927   // For in-loop reductions, no element types are added to ElementTypesInLoop
5928   // if there are no loads/stores in the loop. In this case, check through the
5929   // reduction variables to determine the maximum width.
5930   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5931     // Reset MaxWidth so that we can find the smallest type used by recurrences
5932     // in the loop.
5933     MaxWidth = -1U;
5934     for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
5935       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5936       // When finding the min width used by the recurrence we need to account
5937       // for casts on the input operands of the recurrence.
5938       MaxWidth = std::min<unsigned>(
5939           MaxWidth, std::min<unsigned>(
5940                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5941                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5942     }
5943   } else {
5944     for (Type *T : ElementTypesInLoop) {
5945       MinWidth = std::min<unsigned>(
5946           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5947       MaxWidth = std::max<unsigned>(
5948           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5949     }
5950   }
5951   return {MinWidth, MaxWidth};
5952 }
5953 
5954 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5955   ElementTypesInLoop.clear();
5956   // For each block.
5957   for (BasicBlock *BB : TheLoop->blocks()) {
5958     // For each instruction in the loop.
5959     for (Instruction &I : BB->instructionsWithoutDebug()) {
5960       Type *T = I.getType();
5961 
5962       // Skip ignored values.
5963       if (ValuesToIgnore.count(&I))
5964         continue;
5965 
5966       // Only examine Loads, Stores and PHINodes.
5967       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5968         continue;
5969 
5970       // Examine PHI nodes that are reduction variables. Update the type to
5971       // account for the recurrence type.
5972       if (auto *PN = dyn_cast<PHINode>(&I)) {
5973         if (!Legal->isReductionVariable(PN))
5974           continue;
5975         const RecurrenceDescriptor &RdxDesc =
5976             Legal->getReductionVars().find(PN)->second;
5977         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5978             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5979                                       RdxDesc.getRecurrenceType(),
5980                                       TargetTransformInfo::ReductionFlags()))
5981           continue;
5982         T = RdxDesc.getRecurrenceType();
5983       }
5984 
5985       // Examine the stored values.
5986       if (auto *ST = dyn_cast<StoreInst>(&I))
5987         T = ST->getValueOperand()->getType();
5988 
5989       assert(T->isSized() &&
5990              "Expected the load/store/recurrence type to be sized");
5991 
5992       ElementTypesInLoop.insert(T);
5993     }
5994   }
5995 }
5996 
5997 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5998                                                            unsigned LoopCost) {
5999   // -- The interleave heuristics --
6000   // We interleave the loop in order to expose ILP and reduce the loop overhead.
6001   // There are many micro-architectural considerations that we can't predict
6002   // at this level. For example, frontend pressure (on decode or fetch) due to
6003   // code size, or the number and capabilities of the execution ports.
6004   //
6005   // We use the following heuristics to select the interleave count:
6006   // 1. If the code has reductions, then we interleave to break the cross
6007   // iteration dependency.
6008   // 2. If the loop is really small, then we interleave to reduce the loop
6009   // overhead.
6010   // 3. We don't interleave if we think that we will spill registers to memory
6011   // due to the increased register pressure.
6012 
6013   if (!isScalarEpilogueAllowed())
6014     return 1;
6015 
  // A finite maximum safe dependence distance has already constrained the
  // vectorization factor; do not interleave on top of it.
6017   if (Legal->getMaxSafeDepDistBytes() != -1U)
6018     return 1;
6019 
6020   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
6021   const bool HasReductions = !Legal->getReductionVars().empty();
  // Do not interleave loops with a relatively small known or estimated trip
  // count. However, we will still interleave when
  // InterleaveSmallLoopScalarReduction is enabled and the code has scalar
  // reductions (HasReductions && VF == 1), because under those conditions
  // interleaving can expose ILP and break cross-iteration dependences for
  // reductions.
6027   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
6028       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
6029     return 1;
6030 
6031   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these values below, so make sure that every register class
  // reports at least one user to avoid dividing by zero.
6034   for (auto& pair : R.MaxLocalUsers) {
6035     pair.second = std::max(pair.second, 1U);
6036   }
6037 
6038   // We calculate the interleave count using the following formula.
6039   // Subtract the number of loop invariants from the number of available
6040   // registers. These registers are used by all of the interleaved instances.
6041   // Next, divide the remaining registers by the number of registers that is
6042   // required by the loop, in order to estimate how many parallel instances
6043   // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want a power-of-two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want a power-of-two interleave count to ensure that the induction
  // variable of the vector loop wraps to zero when the tail is folded by
  // masking; this currently only happens under OptForSize, in which case IC is
  // set to 1 above.
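  // A quick sketch with made-up numbers: given 32 registers in a class, 2 of
  // them holding loop invariants and MaxLocalUsers == 5, the formula below
  // yields PowerOf2Floor((32 - 2) / 5) == 4 interleaved instances before we
  // would expect spilling.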
6049   unsigned IC = UINT_MAX;
6050 
6051   for (auto& pair : R.MaxLocalUsers) {
6052     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(pair.first)
                      << " register class\n");
6056     if (VF.isScalar()) {
6057       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
6058         TargetNumRegisters = ForceTargetNumScalarRegs;
6059     } else {
6060       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
6061         TargetNumRegisters = ForceTargetNumVectorRegs;
6062     }
6063     unsigned MaxLocalUsers = pair.second;
6064     unsigned LoopInvariantRegs = 0;
6065     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
6066       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
6067 
    unsigned TmpIC =
        PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
6069     // Don't count the induction variable as interleaved.
6070     if (EnableIndVarRegisterHeur) {
6071       TmpIC =
6072           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
6073                         std::max(1U, (MaxLocalUsers - 1)));
6074     }
6075 
6076     IC = std::min(IC, TmpIC);
6077   }
6078 
6079   // Clamp the interleave ranges to reasonable counts.
6080   unsigned MaxInterleaveCount =
6081       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
6082 
6083   // Check if the user has overridden the max.
6084   if (VF.isScalar()) {
6085     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
6086       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
6087   } else {
6088     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
6089       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
6090   }
6091 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF, ensuring it is
  // at least 1.
  //
  // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second
  // vector iteration is enabled. However, for larger loops, there is likely to
  // be a similar benefit as for fixed-width vectors. For now, we choose to
  // leave the InterleaveCount as if vscale is '1', although if some
  // information about the vector is known (e.g. min vector size), we can make
  // a better decision.
6102   if (BestKnownTC) {
6103     MaxInterleaveCount =
6104         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
6105     // Make sure MaxInterleaveCount is greater than 0.
6106     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
6107   }
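  // For example (hypothetical numbers): a loop with an estimated trip count of
  // 24 vectorized at a known minimum VF of 8 clamps MaxInterleaveCount to at
  // most 24 / 8 == 3, while a trip count smaller than VF clamps it back up
  // to 1.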
6108 
6109   assert(MaxInterleaveCount > 0 &&
6110          "Maximum interleave count must be greater than 0");
6111 
  // Clamp the calculated IC to be between 1 and the maximum interleave count
  // that the target and trip count allow.
6114   if (IC > MaxInterleaveCount)
6115     IC = MaxInterleaveCount;
6116   else
6117     // Make sure IC is greater than 0.
6118     IC = std::max(1u, IC);
6119 
6120   assert(IC > 0 && "Interleave count must be greater than 0.");
6121 
6122   // If we did not calculate the cost for VF (because the user selected the VF)
6123   // then we calculate the cost of VF here.
6124   if (LoopCost == 0) {
6125     InstructionCost C = expectedCost(VF).first;
6126     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
6127     LoopCost = *C.getValue();
6128   }
6129 
6130   assert(LoopCost && "Non-zero loop cost expected");
6131 
6132   // Interleave if we vectorized this loop and there is a reduction that could
6133   // benefit from interleaving.
6134   if (VF.isVector() && HasReductions) {
6135     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
6136     return IC;
6137   }
6138 
6139   // For any scalar loop that either requires runtime checks or predication we
6140   // are better off leaving this to the unroller. Note that if we've already
6141   // vectorized the loop we will have done the runtime check and so interleaving
6142   // won't require further checks.
6143   bool ScalarInterleavingRequiresPredication =
6144       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
6145          return Legal->blockNeedsPredication(BB);
6146        }));
6147   bool ScalarInterleavingRequiresRuntimePointerCheck =
6148       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
6149 
6150   // We want to interleave small loops in order to reduce the loop overhead and
6151   // potentially expose ILP opportunities.
6152   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
6153                     << "LV: IC is " << IC << '\n'
6154                     << "LV: VF is " << VF << '\n');
6155   const bool AggressivelyInterleaveReductions =
6156       TTI.enableAggressiveInterleaving(HasReductions);
6157   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
6158       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
6159     // We assume that the cost overhead is 1 and we use the cost model
6160     // to estimate the cost of the loop and interleave until the cost of the
6161     // loop overhead is about 5% of the cost of the loop.
6162     unsigned SmallIC =
6163         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
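    // For instance, with illustrative values SmallLoopCost == 20 and
    // LoopCost == 3, SmallIC becomes min(IC, PowerOf2Floor(20 / 3)), i.e.
    // min(IC, 4).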
6164 
6165     // Interleave until store/load ports (estimated by max interleave count) are
6166     // saturated.
6167     unsigned NumStores = Legal->getNumStores();
6168     unsigned NumLoads = Legal->getNumLoads();
6169     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
6170     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
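    // E.g. (made-up counts) with IC == 8, 2 stores and 4 loads in the loop,
    // StoresIC == 4 and LoadsIC == 2, so an interleave count of 4 already
    // saturates the store ports.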
6171 
6172     // There is little point in interleaving for reductions containing selects
6173     // and compares when VF=1 since it may just create more overhead than it's
6174     // worth for loops with small trip counts. This is because we still have to
6175     // do the final reduction after the loop.
6176     bool HasSelectCmpReductions =
6177         HasReductions &&
6178         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6179           const RecurrenceDescriptor &RdxDesc = Reduction.second;
6180           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
6181               RdxDesc.getRecurrenceKind());
6182         });
6183     if (HasSelectCmpReductions) {
6184       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
6185       return 1;
6186     }
6187 
6188     // If we have a scalar reduction (vector reductions are already dealt with
6189     // by this point), we can increase the critical path length if the loop
6190     // we're interleaving is inside another loop. For tree-wise reductions
6191     // set the limit to 2, and for ordered reductions it's best to disable
6192     // interleaving entirely.
6193     if (HasReductions && TheLoop->getLoopDepth() > 1) {
6194       bool HasOrderedReductions =
6195           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
6196             const RecurrenceDescriptor &RdxDesc = Reduction.second;
6197             return RdxDesc.isOrdered();
6198           });
6199       if (HasOrderedReductions) {
6200         LLVM_DEBUG(
6201             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
6202         return 1;
6203       }
6204 
6205       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
6206       SmallIC = std::min(SmallIC, F);
6207       StoresIC = std::min(StoresIC, F);
6208       LoadsIC = std::min(LoadsIC, F);
6209     }
6210 
6211     if (EnableLoadStoreRuntimeInterleave &&
6212         std::max(StoresIC, LoadsIC) > SmallIC) {
6213       LLVM_DEBUG(
6214           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
6215       return std::max(StoresIC, LoadsIC);
6216     }
6217 
6218     // If there are scalar reductions and TTI has enabled aggressive
6219     // interleaving for reductions, we will interleave to expose ILP.
6220     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
6221         AggressivelyInterleaveReductions) {
6222       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave by at least SmallIC, but not as aggressively as the normal
      // IC, to handle the rare case where resources are too limited.
6225       return std::max(IC / 2, SmallIC);
6226     } else {
6227       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
6228       return SmallIC;
6229     }
6230   }
6231 
6232   // Interleave if this is a large loop (small loops are already dealt with by
6233   // this point) that could benefit from interleaving.
6234   if (AggressivelyInterleaveReductions) {
6235     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
6236     return IC;
6237   }
6238 
6239   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
6240   return 1;
6241 }
6242 
6243 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
6244 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
6245   // This function calculates the register usage by measuring the highest number
6246   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in a topological order and
6248   // assign a number to each instruction. We use RPO to ensure that defs are
6249   // met before their users. We assume that each instruction that has in-loop
6250   // users starts an interval. We record every time that an in-loop value is
6251   // used, so we have a list of the first and last occurrences of each
6252   // instruction. Next, we transpose this data structure into a multi map that
6253   // holds the list of intervals that *end* at a specific location. This multi
6254   // map allows us to perform a linear search. We scan the instructions linearly
6255   // and record each time that a new interval starts, by placing it in a set.
6256   // If we find this value in the multi-map then we remove it from the set.
6257   // The max register usage is the maximum size of the set.
6258   // We also search for instructions that are defined outside the loop, but are
6259   // used inside the loop. We need this number separately from the max-interval
6260   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
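  // A small illustration on hypothetical IR (indices are RPO positions):
  //   0: %a = load ...
  //   1: %b = add %a, 1
  //   2: %c = mul %a, %b
  //   3: store %c
  // %a and %b are both still open when the mul is reached, so the maximum
  // number of simultaneously live values recorded for this class is 2.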
6262   LoopBlocksDFS DFS(TheLoop);
6263   DFS.perform(LI);
6264 
6265   RegisterUsage RU;
6266 
6267   // Each 'key' in the map opens a new interval. The values
6268   // of the map are the index of the 'last seen' usage of the
6269   // instruction that is the key.
6270   using IntervalMap = DenseMap<Instruction *, unsigned>;
6271 
6272   // Maps instruction to its index.
6273   SmallVector<Instruction *, 64> IdxToInstr;
6274   // Marks the end of each interval.
6275   IntervalMap EndPoint;
  // Saves the set of instructions that are used inside the loop.
6277   SmallPtrSet<Instruction *, 8> Ends;
6278   // Saves the list of values that are used in the loop but are
6279   // defined outside the loop, such as arguments and constants.
6280   SmallPtrSet<Value *, 8> LoopInvariants;
6281 
6282   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
6283     for (Instruction &I : BB->instructionsWithoutDebug()) {
6284       IdxToInstr.push_back(&I);
6285 
6286       // Save the end location of each USE.
6287       for (Value *U : I.operands()) {
6288         auto *Instr = dyn_cast<Instruction>(U);
6289 
6290         // Ignore non-instruction values such as arguments, constants, etc.
6291         if (!Instr)
6292           continue;
6293 
6294         // If this instruction is outside the loop then record it and continue.
6295         if (!TheLoop->contains(Instr)) {
6296           LoopInvariants.insert(Instr);
6297           continue;
6298         }
6299 
6300         // Overwrite previous end points.
6301         EndPoint[Instr] = IdxToInstr.size();
6302         Ends.insert(Instr);
6303       }
6304     }
6305   }
6306 
6307   // Saves the list of intervals that end with the index in 'key'.
6308   using InstrList = SmallVector<Instruction *, 2>;
6309   DenseMap<unsigned, InstrList> TransposeEnds;
6310 
6311   // Transpose the EndPoints to a list of values that end at each index.
6312   for (auto &Interval : EndPoint)
6313     TransposeEnds[Interval.second].push_back(Interval.first);
6314 
6315   SmallPtrSet<Instruction *, 8> OpenIntervals;
6316   SmallVector<RegisterUsage, 8> RUs(VFs.size());
6317   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
6318 
6319   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
6320 
6321   // A lambda that gets the register usage for the given type and VF.
6322   const auto &TTICapture = TTI;
6323   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
6324     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
6325       return 0;
6326     InstructionCost::CostType RegUsage =
6327         *TTICapture.getRegUsageForType(VectorType::get(Ty, VF)).getValue();
6328     assert(RegUsage >= 0 && RegUsage <= std::numeric_limits<unsigned>::max() &&
6329            "Nonsensical values for register usage.");
6330     return RegUsage;
6331   };
6332 
6333   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
6334     Instruction *I = IdxToInstr[i];
6335 
6336     // Remove all of the instructions that end at this location.
6337     InstrList &List = TransposeEnds[i];
6338     for (Instruction *ToRemove : List)
6339       OpenIntervals.erase(ToRemove);
6340 
6341     // Ignore instructions that are never used within the loop.
6342     if (!Ends.count(I))
6343       continue;
6344 
6345     // Skip ignored values.
6346     if (ValuesToIgnore.count(I))
6347       continue;
6348 
6349     // For each VF find the maximum usage of registers.
6350     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
6351       // Count the number of live intervals.
6352       SmallMapVector<unsigned, unsigned, 4> RegUsage;
6353 
6354       if (VFs[j].isScalar()) {
6355         for (auto Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
6357           if (RegUsage.find(ClassID) == RegUsage.end())
6358             RegUsage[ClassID] = 1;
6359           else
6360             RegUsage[ClassID] += 1;
6361         }
6362       } else {
6363         collectUniformsAndScalars(VFs[j]);
6364         for (auto Inst : OpenIntervals) {
6365           // Skip ignored values for VF > 1.
6366           if (VecValuesToIgnore.count(Inst))
6367             continue;
6368           if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
6370             if (RegUsage.find(ClassID) == RegUsage.end())
6371               RegUsage[ClassID] = 1;
6372             else
6373               RegUsage[ClassID] += 1;
6374           } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
6376             if (RegUsage.find(ClassID) == RegUsage.end())
6377               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
6378             else
6379               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
6380           }
6381         }
6382       }
6383 
6384       for (auto& pair : RegUsage) {
6385         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
          MaxUsages[j][pair.first] =
              std::max(MaxUsages[j][pair.first], pair.second);
6387         else
6388           MaxUsages[j][pair.first] = pair.second;
6389       }
6390     }
6391 
6392     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6393                       << OpenIntervals.size() << '\n');
6394 
6395     // Add the current instruction to the list of open intervals.
6396     OpenIntervals.insert(I);
6397   }
6398 
6399   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6400     SmallMapVector<unsigned, unsigned, 4> Invariant;
6401 
6402     for (auto Inst : LoopInvariants) {
6403       unsigned Usage =
6404           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6405       unsigned ClassID =
6406           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
6407       if (Invariant.find(ClassID) == Invariant.end())
6408         Invariant[ClassID] = Usage;
6409       else
6410         Invariant[ClassID] += Usage;
6411     }
6412 
6413     LLVM_DEBUG({
6414       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6415       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6416              << " item\n";
6417       for (const auto &pair : MaxUsages[i]) {
6418         dbgs() << "LV(REG): RegisterClass: "
6419                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6420                << " registers\n";
6421       }
6422       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6423              << " item\n";
6424       for (const auto &pair : Invariant) {
6425         dbgs() << "LV(REG): RegisterClass: "
6426                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6427                << " registers\n";
6428       }
6429     });
6430 
6431     RU.LoopInvariantRegs = Invariant;
6432     RU.MaxLocalUsers = MaxUsages[i];
6433     RUs[i] = RU;
6434   }
6435 
6436   return RUs;
6437 }
6438 
6439 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6440   // If we aren't vectorizing the loop, or if we've already collected the
6441   // instructions to scalarize, there's nothing to do. Collection may already
6442   // have occurred if we have a user-selected VF and are now computing the
6443   // expected cost for interleaving.
6444   if (VF.isScalar() || VF.isZero() ||
6445       InstsToScalarize.find(VF) != InstsToScalarize.end())
6446     return;
6447 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6449   // not profitable to scalarize any instructions, the presence of VF in the
6450   // map will indicate that we've analyzed it already.
6451   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6452 
6453   // Find all the instructions that are scalar with predication in the loop and
6454   // determine if it would be better to not if-convert the blocks they are in.
6455   // If so, we also record the instructions to scalarize.
6456   for (BasicBlock *BB : TheLoop->blocks()) {
6457     if (!blockNeedsPredicationForAnyReason(BB))
6458       continue;
6459     for (Instruction &I : *BB)
6460       if (isScalarWithPredication(&I, VF)) {
6461         ScalarCostsTy ScalarCosts;
6462         // Do not apply discount if scalable, because that would lead to
6463         // invalid scalarization costs.
6464         if (!VF.isScalable() &&
6465             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6466           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6467         // Remember that BB will remain after vectorization.
6468         PredicatedBBsAfterVectorization.insert(BB);
6469       }
6470   }
6471 }
6472 
6473 int LoopVectorizationCostModel::computePredInstDiscount(
6474     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6475   assert(!isUniformAfterVectorization(PredInst, VF) &&
6476          "Instruction marked uniform-after-vectorization will be predicated");
6477 
6478   // Initialize the discount to zero, meaning that the scalar version and the
6479   // vector version cost the same.
6480   InstructionCost Discount = 0;
6481 
6482   // Holds instructions to analyze. The instructions we visit are mapped in
6483   // ScalarCosts. Those instructions are the ones that would be scalarized if
6484   // we find that the scalar version costs less.
6485   SmallVector<Instruction *, 8> Worklist;
6486 
6487   // Returns true if the given instruction can be scalarized.
6488   auto canBeScalarized = [&](Instruction *I) -> bool {
6489     // We only attempt to scalarize instructions forming a single-use chain
6490     // from the original predicated block that would otherwise be vectorized.
6491     // Although not strictly necessary, we give up on instructions we know will
6492     // already be scalar to avoid traversing chains that are unlikely to be
6493     // beneficial.
6494     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6495         isScalarAfterVectorization(I, VF))
6496       return false;
6497 
6498     // If the instruction is scalar with predication, it will be analyzed
6499     // separately. We ignore it within the context of PredInst.
6500     if (isScalarWithPredication(I, VF))
6501       return false;
6502 
6503     // If any of the instruction's operands are uniform after vectorization,
6504     // the instruction cannot be scalarized. This prevents, for example, a
6505     // masked load from being scalarized.
6506     //
6507     // We assume we will only emit a value for lane zero of an instruction
6508     // marked uniform after vectorization, rather than VF identical values.
6509     // Thus, if we scalarize an instruction that uses a uniform, we would
6510     // create uses of values corresponding to the lanes we aren't emitting code
6511     // for. This behavior can be changed by allowing getScalarValue to clone
6512     // the lane zero values for uniforms rather than asserting.
6513     for (Use &U : I->operands())
6514       if (auto *J = dyn_cast<Instruction>(U.get()))
6515         if (isUniformAfterVectorization(J, VF))
6516           return false;
6517 
6518     // Otherwise, we can scalarize the instruction.
6519     return true;
6520   };
6521 
6522   // Compute the expected cost discount from scalarizing the entire expression
6523   // feeding the predicated instruction. We currently only consider expressions
6524   // that are single-use instruction chains.
6525   Worklist.push_back(PredInst);
6526   while (!Worklist.empty()) {
6527     Instruction *I = Worklist.pop_back_val();
6528 
6529     // If we've already analyzed the instruction, there's nothing to do.
6530     if (ScalarCosts.find(I) != ScalarCosts.end())
6531       continue;
6532 
6533     // Compute the cost of the vector instruction. Note that this cost already
6534     // includes the scalarization overhead of the predicated instruction.
6535     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6536 
6537     // Compute the cost of the scalarized instruction. This cost is the cost of
6538     // the instruction as if it wasn't if-converted and instead remained in the
6539     // predicated block. We will scale this cost by block probability after
6540     // computing the scalarization overhead.
6541     InstructionCost ScalarCost =
6542         VF.getFixedValue() *
6543         getInstructionCost(I, ElementCount::getFixed(1)).first;
6544 
6545     // Compute the scalarization overhead of needed insertelement instructions
6546     // and phi nodes.
6547     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6548       ScalarCost += TTI.getScalarizationOverhead(
6549           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6550           APInt::getAllOnes(VF.getFixedValue()), true, false);
6551       ScalarCost +=
6552           VF.getFixedValue() *
6553           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6554     }
6555 
6556     // Compute the scalarization overhead of needed extractelement
6557     // instructions. For each of the instruction's operands, if the operand can
6558     // be scalarized, add it to the worklist; otherwise, account for the
6559     // overhead.
6560     for (Use &U : I->operands())
6561       if (auto *J = dyn_cast<Instruction>(U.get())) {
6562         assert(VectorType::isValidElementType(J->getType()) &&
6563                "Instruction has non-scalar type");
6564         if (canBeScalarized(J))
6565           Worklist.push_back(J);
6566         else if (needsExtract(J, VF)) {
6567           ScalarCost += TTI.getScalarizationOverhead(
6568               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6569               APInt::getAllOnes(VF.getFixedValue()), false, true);
6570         }
6571       }
6572 
6573     // Scale the total scalar cost by block probability.
6574     ScalarCost /= getReciprocalPredBlockProb();
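    // Rough illustration, assuming the usual 50% block-execution estimate
    // (reciprocal of 2) and ignoring the insert/extract overhead added above:
    // a per-lane scalar cost of 4 at a fixed VF of 4 starts at 16 and is
    // scaled down to 8 here.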
6575 
6576     // Compute the discount. A non-negative discount means the vector version
6577     // of the instruction costs more, and scalarizing would be beneficial.
6578     Discount += VectorCost - ScalarCost;
6579     ScalarCosts[I] = ScalarCost;
6580   }
6581 
6582   return *Discount.getValue();
6583 }
6584 
6585 LoopVectorizationCostModel::VectorizationCostTy
6586 LoopVectorizationCostModel::expectedCost(
6587     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6588   VectorizationCostTy Cost;
6589 
6590   // For each block.
6591   for (BasicBlock *BB : TheLoop->blocks()) {
6592     VectorizationCostTy BlockCost;
6593 
6594     // For each instruction in the old loop.
6595     for (Instruction &I : BB->instructionsWithoutDebug()) {
6596       // Skip ignored values.
6597       if (ValuesToIgnore.count(&I) ||
6598           (VF.isVector() && VecValuesToIgnore.count(&I)))
6599         continue;
6600 
6601       VectorizationCostTy C = getInstructionCost(&I, VF);
6602 
6603       // Check if we should override the cost.
6604       if (C.first.isValid() &&
6605           ForceTargetInstructionCost.getNumOccurrences() > 0)
6606         C.first = InstructionCost(ForceTargetInstructionCost);
6607 
6608       // Keep a list of instructions with invalid costs.
6609       if (Invalid && !C.first.isValid())
6610         Invalid->emplace_back(&I, VF);
6611 
6612       BlockCost.first += C.first;
6613       BlockCost.second |= C.second;
6614       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6615                         << " for VF " << VF << " For instruction: " << I
6616                         << '\n');
6617     }
6618 
6619     // If we are vectorizing a predicated block, it will have been
6620     // if-converted. This means that the block's instructions (aside from
6621     // stores and instructions that may divide by zero) will now be
6622     // unconditionally executed. For the scalar case, we may not always execute
6623     // the predicated block, if it is an if-else block. Thus, scale the block's
6624     // cost by the probability of executing it. blockNeedsPredication from
6625     // Legal is used so as to not include all blocks in tail folded loops.
6626     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6627       BlockCost.first /= getReciprocalPredBlockProb();
6628 
6629     Cost.first += BlockCost.first;
6630     Cost.second |= BlockCost.second;
6631   }
6632 
6633   return Cost;
6634 }
6635 
6636 /// Gets Address Access SCEV after verifying that the access pattern
6637 /// is loop invariant except the induction variable dependence.
6638 ///
6639 /// This SCEV can be sent to the Target in order to estimate the address
6640 /// calculation cost.
6641 static const SCEV *getAddressAccessSCEV(
6642               Value *Ptr,
6643               LoopVectorizationLegality *Legal,
6644               PredicatedScalarEvolution &PSE,
6645               const Loop *TheLoop) {
6646 
6647   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6648   if (!Gep)
6649     return nullptr;
6650 
  // We are looking for a GEP with all loop-invariant indices except for one,
  // which should be an induction variable.
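  // For example (illustrative IR), the following access qualifies:
  //   %gep = getelementptr [1024 x float], ptr %base, i64 %inv, i64 %iv
  // where %inv is loop-invariant and %iv is the loop's induction variable.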
6653   auto SE = PSE.getSE();
6654   unsigned NumOperands = Gep->getNumOperands();
6655   for (unsigned i = 1; i < NumOperands; ++i) {
6656     Value *Opd = Gep->getOperand(i);
6657     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6658         !Legal->isInductionVariable(Opd))
6659       return nullptr;
6660   }
6661 
  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
6663   return PSE.getSCEV(Ptr);
6664 }
6665 
6666 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6667   return Legal->hasStride(I->getOperand(0)) ||
6668          Legal->hasStride(I->getOperand(1));
6669 }
6670 
6671 InstructionCost
6672 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6673                                                         ElementCount VF) {
6674   assert(VF.isVector() &&
6675          "Scalarization cost of instruction implies vectorization.");
6676   if (VF.isScalable())
6677     return InstructionCost::getInvalid();
6678 
6679   Type *ValTy = getLoadStoreType(I);
6680   auto SE = PSE.getSE();
6681 
6682   unsigned AS = getLoadStoreAddressSpace(I);
6683   Value *Ptr = getLoadStorePointerOperand(I);
6684   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6685   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6686   //       that it is being called from this specific place.
6687 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6690   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6691 
6692   // Get the cost of the scalar memory instruction and address computation.
6693   InstructionCost Cost =
6694       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6695 
6696   // Don't pass *I here, since it is scalar but will actually be part of a
6697   // vectorized loop where the user of it is a vectorized instruction.
6698   const Align Alignment = getLoadStoreAlignment(I);
6699   Cost += VF.getKnownMinValue() *
6700           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6701                               AS, TTI::TCK_RecipThroughput);
6702 
6703   // Get the overhead of the extractelement and insertelement instructions
6704   // we might create due to scalarization.
6705   Cost += getScalarizationOverhead(I, VF);
6706 
6707   // If we have a predicated load/store, it will need extra i1 extracts and
6708   // conditional branches, but may not be executed for each vector lane. Scale
6709   // the cost by the probability of executing the predicated block.
6710   if (isPredicatedInst(I, VF)) {
6711     Cost /= getReciprocalPredBlockProb();
6712 
6713     // Add the cost of an i1 extract and a branch
6714     auto *Vec_i1Ty =
6715         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6716     Cost += TTI.getScalarizationOverhead(
6717         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6718         /*Insert=*/false, /*Extract=*/true);
6719     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6720   }
6721 
6722   return Cost;
6723 }
6724 
6725 InstructionCost
6726 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6727                                                     ElementCount VF) {
6728   Type *ValTy = getLoadStoreType(I);
6729   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6730   Value *Ptr = getLoadStorePointerOperand(I);
6731   unsigned AS = getLoadStoreAddressSpace(I);
6732   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6733   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6734 
6735   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6736          "Stride should be 1 or -1 for consecutive memory access");
6737   const Align Alignment = getLoadStoreAlignment(I);
6738   InstructionCost Cost = 0;
6739   if (Legal->isMaskRequired(I))
6740     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6741                                       CostKind);
6742   else
6743     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6744                                 CostKind, I);
6745 
6746   bool Reverse = ConsecutiveStride < 0;
6747   if (Reverse)
6748     Cost +=
6749         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6750   return Cost;
6751 }
6752 
6753 InstructionCost
6754 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6755                                                 ElementCount VF) {
6756   assert(Legal->isUniformMemOp(*I));
6757 
6758   Type *ValTy = getLoadStoreType(I);
6759   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6760   const Align Alignment = getLoadStoreAlignment(I);
6761   unsigned AS = getLoadStoreAddressSpace(I);
6762   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6763   if (isa<LoadInst>(I)) {
6764     return TTI.getAddressComputationCost(ValTy) +
6765            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6766                                CostKind) +
6767            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6768   }
6769   StoreInst *SI = cast<StoreInst>(I);
6770 
6771   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6772   return TTI.getAddressComputationCost(ValTy) +
6773          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6774                              CostKind) +
6775          (isLoopInvariantStoreValue
6776               ? 0
6777               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6778                                        VF.getKnownMinValue() - 1));
6779 }
6780 
6781 InstructionCost
6782 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6783                                                  ElementCount VF) {
6784   Type *ValTy = getLoadStoreType(I);
6785   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6786   const Align Alignment = getLoadStoreAlignment(I);
6787   const Value *Ptr = getLoadStorePointerOperand(I);
6788 
6789   return TTI.getAddressComputationCost(VectorTy) +
6790          TTI.getGatherScatterOpCost(
6791              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6792              TargetTransformInfo::TCK_RecipThroughput, I);
6793 }
6794 
6795 InstructionCost
6796 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6797                                                    ElementCount VF) {
6798   // TODO: Once we have support for interleaving with scalable vectors
6799   // we can calculate the cost properly here.
6800   if (VF.isScalable())
6801     return InstructionCost::getInvalid();
6802 
6803   Type *ValTy = getLoadStoreType(I);
6804   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6805   unsigned AS = getLoadStoreAddressSpace(I);
6806 
6807   auto Group = getInterleavedAccessGroup(I);
  assert(Group && "Failed to get an interleaved access group.");
6809 
6810   unsigned InterleaveFactor = Group->getFactor();
6811   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6812 
6813   // Holds the indices of existing members in the interleaved group.
6814   SmallVector<unsigned, 4> Indices;
6815   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6816     if (Group->getMember(IF))
6817       Indices.push_back(IF);
6818 
6819   // Calculate the cost of the whole interleaved group.
6820   bool UseMaskForGaps =
6821       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6822       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
6823   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6824       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6825       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6826 
6827   if (Group->isReverse()) {
6828     // TODO: Add support for reversed masked interleaved access.
6829     assert(!Legal->isMaskRequired(I) &&
6830            "Reverse masked interleaved access not supported.");
6831     Cost +=
6832         Group->getNumMembers() *
6833         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6834   }
6835   return Cost;
6836 }
6837 
6838 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6839     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6840   using namespace llvm::PatternMatch;
  // Early exit if there are no in-loop reductions.
6842   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6843     return None;
6844   auto *VectorTy = cast<VectorType>(Ty);
6845 
  // We are looking for one of the following patterns, computing the minimal
  // acceptable cost for it:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is to walk down the use chain, finding the root reduction
  // instruction in InLoopReductionImmediateChains. From there we match the
  // mul/ext pattern and test the cost of the entire pattern against the cost
  // of its components. If the reduction cost is lower, we return it for the
  // reduction instruction and 0 for the other instructions in the pattern.
  // Otherwise we return None, signalling that the original cost modelling
  // should be used.
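  // Illustrative IR for the reduce(mul(ext(A), ext(B))) case (value names are
  // made up for the example):
  //   %a.ext = sext <16 x i8> %a to <16 x i32>
  //   %b.ext = sext <16 x i8> %b to <16 x i32>
  //   %mul   = mul nsw <16 x i32> %a.ext, %b.ext
  //   %sum   = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
  // If the target reports a cheaper extended-add reduction for this shape, the
  // whole pattern is costed as one operation and the mul/ext members cost 0.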
6858   Instruction *RetI = I;
6859   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6860     if (!RetI->hasOneUser())
6861       return None;
6862     RetI = RetI->user_back();
6863   }
6864   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6865       RetI->user_back()->getOpcode() == Instruction::Add) {
6866     if (!RetI->hasOneUser())
6867       return None;
6868     RetI = RetI->user_back();
6869   }
6870 
  // Test if the found instruction is a reduction. If not, return None so that
  // the caller falls back to the original cost modelling.
6873   if (!InLoopReductionImmediateChains.count(RetI))
6874     return None;
6875 
6876   // Find the reduction this chain is a part of and calculate the basic cost of
6877   // the reduction on its own.
6878   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6879   Instruction *ReductionPhi = LastChain;
6880   while (!isa<PHINode>(ReductionPhi))
6881     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6882 
6883   const RecurrenceDescriptor &RdxDesc =
6884       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6885 
6886   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6887       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6888 
6889   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6890   // normal fmul instruction to the cost of the fadd reduction.
6891   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6892     BaseCost +=
6893         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6894 
6895   // If we're using ordered reductions then we can just return the base cost
6896   // here, since getArithmeticReductionCost calculates the full ordered
6897   // reduction cost when FP reassociation is not allowed.
6898   if (useOrderedReductions(RdxDesc))
6899     return BaseCost;
6900 
6901   // Get the operand that was not the reduction chain and match it to one of the
6902   // patterns, returning the better cost if it is found.
6903   Instruction *RedOp = RetI->getOperand(1) == LastChain
6904                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6905                            : dyn_cast<Instruction>(RetI->getOperand(1));
6906 
6907   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6908 
6909   Instruction *Op0, *Op1;
6910   if (RedOp &&
6911       match(RedOp,
6912             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6913       match(Op0, m_ZExtOrSExt(m_Value())) &&
6914       Op0->getOpcode() == Op1->getOpcode() &&
6915       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6916       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6917       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6918 
    // Matched reduce(ext(mul(ext(A), ext(B))))
6920     // Note that the extend opcodes need to all match, or if A==B they will have
6921     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6922     // which is equally fine.
6923     bool IsUnsigned = isa<ZExtInst>(Op0);
6924     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6925     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6926 
6927     InstructionCost ExtCost =
6928         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6929                              TTI::CastContextHint::None, CostKind, Op0);
6930     InstructionCost MulCost =
6931         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6932     InstructionCost Ext2Cost =
6933         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6934                              TTI::CastContextHint::None, CostKind, RedOp);
6935 
6936     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6937         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6938         CostKind);
6939 
6940     if (RedCost.isValid() &&
6941         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6942       return I == RetI ? RedCost : 0;
6943   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6944              !TheLoop->isLoopInvariant(RedOp)) {
6945     // Matched reduce(ext(A))
6946     bool IsUnsigned = isa<ZExtInst>(RedOp);
6947     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6948     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6949         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6950         CostKind);
6951 
6952     InstructionCost ExtCost =
6953         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6954                              TTI::CastContextHint::None, CostKind, RedOp);
6955     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6956       return I == RetI ? RedCost : 0;
6957   } else if (RedOp &&
6958              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6959     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6960         Op0->getOpcode() == Op1->getOpcode() &&
6961         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6962       bool IsUnsigned = isa<ZExtInst>(Op0);
6963       Type *Op0Ty = Op0->getOperand(0)->getType();
6964       Type *Op1Ty = Op1->getOperand(0)->getType();
6965       Type *LargestOpTy =
6966           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6967                                                                     : Op0Ty;
6968       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6969 
6970       // Matched reduce(mul(ext(A), ext(B))), where the two ext may be of
6971       // different sizes. We take the largest type as the ext to reduce, and add
6972       // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
6973       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6974           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6975           TTI::CastContextHint::None, CostKind, Op0);
6976       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6977           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6978           TTI::CastContextHint::None, CostKind, Op1);
6979       InstructionCost MulCost =
6980           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6981 
6982       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6983           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6984           CostKind);
6985       InstructionCost ExtraExtCost = 0;
6986       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6987         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6988         ExtraExtCost = TTI.getCastInstrCost(
6989             ExtraExtOp->getOpcode(), ExtType,
6990             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6991             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6992       }
6993 
6994       if (RedCost.isValid() &&
6995           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6996         return I == RetI ? RedCost : 0;
6997     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6998       // Matched reduce(mul())
6999       InstructionCost MulCost =
7000           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7001 
7002       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
7003           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
7004           CostKind);
7005 
7006       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
7007         return I == RetI ? RedCost : 0;
7008     }
7009   }
7010 
7011   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
7012 }
7013 
7014 InstructionCost
7015 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
7016                                                      ElementCount VF) {
7017   // Calculate scalar cost only. Vectorization cost should be ready at this
7018   // moment.
7019   if (VF.isScalar()) {
7020     Type *ValTy = getLoadStoreType(I);
7021     const Align Alignment = getLoadStoreAlignment(I);
7022     unsigned AS = getLoadStoreAddressSpace(I);
7023 
7024     return TTI.getAddressComputationCost(ValTy) +
7025            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
7026                                TTI::TCK_RecipThroughput, I);
7027   }
7028   return getWideningCost(I, VF);
7029 }
7030 
7031 LoopVectorizationCostModel::VectorizationCostTy
7032 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
7033                                                ElementCount VF) {
7034   // If we know that this instruction will remain uniform, check the cost of
7035   // the scalar version.
7036   if (isUniformAfterVectorization(I, VF))
7037     VF = ElementCount::getFixed(1);
7038 
7039   if (VF.isVector() && isProfitableToScalarize(I, VF))
7040     return VectorizationCostTy(InstsToScalarize[VF][I], false);
7041 
7042   // Forced scalars do not have any scalarization overhead.
7043   auto ForcedScalar = ForcedScalars.find(VF);
7044   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
7045     auto InstSet = ForcedScalar->second;
7046     if (InstSet.count(I))
7047       return VectorizationCostTy(
7048           (getInstructionCost(I, ElementCount::getFixed(1)).first *
7049            VF.getKnownMinValue()),
7050           false);
7051   }
7052 
7053   Type *VectorTy;
7054   InstructionCost C = getInstructionCost(I, VF, VectorTy);
7055 
7056   bool TypeNotScalarized = false;
7057   if (VF.isVector() && VectorTy->isVectorTy()) {
7058     unsigned NumParts = TTI.getNumberOfParts(VectorTy);
7059     if (NumParts)
7060       TypeNotScalarized = NumParts < VF.getKnownMinValue();
7061     else
7062       C = InstructionCost::getInvalid();
7063   }
7064   return VectorizationCostTy(C, TypeNotScalarized);
7065 }
7066 
7067 InstructionCost
7068 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
7069                                                      ElementCount VF) const {
7070 
7071   // There is no mechanism yet to create a scalable scalarization loop,
7072   // so this is currently Invalid.
7073   if (VF.isScalable())
7074     return InstructionCost::getInvalid();
7075 
7076   if (VF.isScalar())
7077     return 0;
7078 
7079   InstructionCost Cost = 0;
7080   Type *RetTy = ToVectorTy(I->getType(), VF);
7081   if (!RetTy->isVoidTy() &&
7082       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
7083     Cost += TTI.getScalarizationOverhead(
7084         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
7085         false);
7086 
7087   // Some targets keep addresses scalar.
7088   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
7089     return Cost;
7090 
7091   // Some targets support efficient element stores.
7092   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
7093     return Cost;
7094 
7095   // Collect operands to consider.
7096   CallInst *CI = dyn_cast<CallInst>(I);
7097   Instruction::op_range Ops = CI ? CI->args() : I->operands();
7098 
7099   // Skip operands that do not require extraction/scalarization and do not incur
7100   // any overhead.
7101   SmallVector<Type *> Tys;
7102   for (auto *V : filterExtractingOperands(Ops, VF))
7103     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
7104   return Cost + TTI.getOperandsScalarizationOverhead(
7105                     filterExtractingOperands(Ops, VF), Tys);
7106 }
7107 
7108 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
7109   if (VF.isScalar())
7110     return;
7111   NumPredStores = 0;
7112   for (BasicBlock *BB : TheLoop->blocks()) {
7113     // For each instruction in the old loop.
7114     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
7116       if (!Ptr)
7117         continue;
7118 
7119       // TODO: We should generate better code and update the cost model for
7120       // predicated uniform stores. Today they are treated as any other
7121       // predicated store (see added test cases in
7122       // invariant-store-vectorization.ll).
7123       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
7124         NumPredStores++;
7125 
7126       if (Legal->isUniformMemOp(I)) {
7127         // TODO: Avoid replicating loads and stores instead of
7128         // relying on instcombine to remove them.
7129         // Load: Scalar load + broadcast
7130         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
7131         InstructionCost Cost;
7132         if (isa<StoreInst>(&I) && VF.isScalable() &&
7133             isLegalGatherOrScatter(&I, VF)) {
7134           Cost = getGatherScatterCost(&I, VF);
7135           setWideningDecision(&I, VF, CM_GatherScatter, Cost);
7136         } else {
7137           assert((isa<LoadInst>(&I) || !VF.isScalable()) &&
7138                  "Cannot yet scalarize uniform stores");
7139           Cost = getUniformMemOpCost(&I, VF);
7140           setWideningDecision(&I, VF, CM_Scalarize, Cost);
7141         }
7142         continue;
7143       }
7144 
7145       // We assume that widening is the best solution when possible.
7146       if (memoryInstructionCanBeWidened(&I, VF)) {
7147         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
7148         int ConsecutiveStride = Legal->isConsecutivePtr(
7149             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
7150         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
7151                "Expected consecutive stride.");
7152         InstWidening Decision =
7153             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
7154         setWideningDecision(&I, VF, Decision, Cost);
7155         continue;
7156       }
7157 
7158       // Choose between Interleaving, Gather/Scatter or Scalarization.
7159       InstructionCost InterleaveCost = InstructionCost::getInvalid();
7160       unsigned NumAccesses = 1;
7161       if (isAccessInterleaved(&I)) {
7162         auto Group = getInterleavedAccessGroup(&I);
        assert(Group && "Failed to get an interleaved access group.");
7164 
7165         // Make one decision for the whole group.
7166         if (getWideningDecision(&I, VF) != CM_Unknown)
7167           continue;
7168 
7169         NumAccesses = Group->getNumMembers();
7170         if (interleavedAccessCanBeWidened(&I, VF))
7171           InterleaveCost = getInterleaveGroupCost(&I, VF);
7172       }
7173 
7174       InstructionCost GatherScatterCost =
7175           isLegalGatherOrScatter(&I, VF)
7176               ? getGatherScatterCost(&I, VF) * NumAccesses
7177               : InstructionCost::getInvalid();
7178 
7179       InstructionCost ScalarizationCost =
7180           getMemInstScalarizationCost(&I, VF) * NumAccesses;
7181 
7182       // Choose better solution for the current VF,
7183       // write down this decision and use it during vectorization.
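           // Note the tie-breaking below: interleaving wins when it is no more
           // expensive than gather/scatter and strictly cheaper than
           // scalarization; gather/scatter wins only when strictly cheaper
           // than scalarization; otherwise we scalarize.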
7184       InstructionCost Cost;
7185       InstWidening Decision;
7186       if (InterleaveCost <= GatherScatterCost &&
7187           InterleaveCost < ScalarizationCost) {
7188         Decision = CM_Interleave;
7189         Cost = InterleaveCost;
7190       } else if (GatherScatterCost < ScalarizationCost) {
7191         Decision = CM_GatherScatter;
7192         Cost = GatherScatterCost;
7193       } else {
7194         Decision = CM_Scalarize;
7195         Cost = ScalarizationCost;
7196       }
7197       // If the instruction belongs to an interleave group, the whole group
7198       // receives the same decision. The whole group receives the cost, but
7199       // the cost will actually be assigned to one instruction.
7200       if (auto Group = getInterleavedAccessGroup(&I))
7201         setWideningDecision(Group, VF, Decision, Cost);
7202       else
7203         setWideningDecision(&I, VF, Decision, Cost);
7204     }
7205   }
7206 
7207   // Make sure that any load of an address and any other address computation
7208   // remain scalar unless there is gather/scatter support. This avoids
7209   // inevitable extracts into address registers, and also has the benefit of
7210   // activating LSR more, since that pass can't optimize vectorized
7211   // addresses.
7212   if (TTI.prefersVectorizedAddressing())
7213     return;
7214 
7215   // Start with all scalar pointer uses.
7216   SmallPtrSet<Instruction *, 8> AddrDefs;
7217   for (BasicBlock *BB : TheLoop->blocks())
7218     for (Instruction &I : *BB) {
7219       Instruction *PtrDef =
7220         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
7221       if (PtrDef && TheLoop->contains(PtrDef) &&
7222           getWideningDecision(&I, VF) != CM_GatherScatter)
7223         AddrDefs.insert(PtrDef);
7224     }
7225 
7226   // Add all instructions used to generate the addresses.
7227   SmallVector<Instruction *, 4> Worklist;
7228   append_range(Worklist, AddrDefs);
7229   while (!Worklist.empty()) {
7230     Instruction *I = Worklist.pop_back_val();
7231     for (auto &Op : I->operands())
7232       if (auto *InstOp = dyn_cast<Instruction>(Op))
7233         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
7234             AddrDefs.insert(InstOp).second)
7235           Worklist.push_back(InstOp);
7236   }
7237 
7238   for (auto *I : AddrDefs) {
7239     if (isa<LoadInst>(I)) {
7240       // Setting the desired widening decision should ideally be handled by
7241       // the cost functions, but since that requires knowing whether the
7242       // loaded register is involved in an address computation, the decision
7243       // is instead changed here once we know this is the case.
7244       InstWidening Decision = getWideningDecision(I, VF);
7245       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
7246         // Scalarize a widened load of address.
7247         setWideningDecision(
7248             I, VF, CM_Scalarize,
7249             (VF.getKnownMinValue() *
7250              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
7251       else if (auto Group = getInterleavedAccessGroup(I)) {
7252         // Scalarize an interleave group of address loads.
7253         for (unsigned I = 0; I < Group->getFactor(); ++I) {
7254           if (Instruction *Member = Group->getMember(I))
7255             setWideningDecision(
7256                 Member, VF, CM_Scalarize,
7257                 (VF.getKnownMinValue() *
7258                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
7259         }
7260       }
7261     } else
7262       // Make sure I gets scalarized and receives a cost estimate without
7263       // scalarization overhead.
7264       ForcedScalars[VF].insert(I);
7265   }
7266 }
7267 
7268 InstructionCost
7269 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
7270                                                Type *&VectorTy) {
7271   Type *RetTy = I->getType();
7272   if (canTruncateToMinimalBitwidth(I, VF))
7273     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
7274   auto SE = PSE.getSE();
7275   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
7276 
7277   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
7278                                                 ElementCount VF) -> bool {
7279     if (VF.isScalar())
7280       return true;
7281 
7282     auto Scalarized = InstsToScalarize.find(VF);
7283     assert(Scalarized != InstsToScalarize.end() &&
7284            "VF not yet analyzed for scalarization profitability");
7285     return !Scalarized->second.count(I) &&
7286            llvm::all_of(I->users(), [&](User *U) {
7287              auto *UI = cast<Instruction>(U);
7288              return !Scalarized->second.count(UI);
7289            });
7290   };
7291   (void) hasSingleCopyAfterVectorization;
7292 
7293   if (isScalarAfterVectorization(I, VF)) {
7294     // With the exception of GEPs and PHIs, after scalarization there should
7295     // only be one copy of the instruction generated in the loop. This is
7296     // because the VF is either 1, or any instructions that need scalarizing
7297     // have already been dealt with by the time we get here. As a result,
7298     // we don't have to multiply the instruction cost by VF.
7299     assert(I->getOpcode() == Instruction::GetElementPtr ||
7300            I->getOpcode() == Instruction::PHI ||
7301            (I->getOpcode() == Instruction::BitCast &&
7302             I->getType()->isPointerTy()) ||
7303            hasSingleCopyAfterVectorization(I, VF));
7304     VectorTy = RetTy;
7305   } else
7306     VectorTy = ToVectorTy(RetTy, VF);
7307 
7308   // TODO: We need to estimate the cost of intrinsic calls.
7309   switch (I->getOpcode()) {
7310   case Instruction::GetElementPtr:
7311     // We mark this instruction as zero-cost because the cost of GEPs in
7312     // vectorized code depends on whether the corresponding memory instruction
7313     // is scalarized or not. Therefore, we handle GEPs with the memory
7314     // instruction cost.
7315     return 0;
7316   case Instruction::Br: {
7317     // In cases of scalarized and predicated instructions, there will be VF
7318     // predicated blocks in the vectorized loop. Each branch around these
7319     // blocks also requires an extract of its vector compare i1 element.
7320     bool ScalarPredicatedBB = false;
7321     BranchInst *BI = cast<BranchInst>(I);
7322     if (VF.isVector() && BI->isConditional() &&
7323         (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
7324          PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
7325       ScalarPredicatedBB = true;
7326 
7327     if (ScalarPredicatedBB) {
7328       // Not possible to scalarize a scalable vector with predicated instructions.
7329       if (VF.isScalable())
7330         return InstructionCost::getInvalid();
7331       // Return cost for branches around scalarized and predicated blocks.
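           // Illustrative example for VF = 4: this models four extracts of the
           // i1 mask elements plus four scalar branches around the predicated
           // blocks.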
7332       auto *Vec_i1Ty =
7333           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
7334       return (
7335           TTI.getScalarizationOverhead(
7336               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
7337           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
7338     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
7339       // The back-edge branch will remain, as will all scalar branches.
7340       return TTI.getCFInstrCost(Instruction::Br, CostKind);
7341     else
7342       // This branch will be eliminated by if-conversion.
7343       return 0;
7344     // Note: We currently assume zero cost for an unconditional branch inside
7345     // a predicated block since it will become a fall-through, although we
7346     // may decide in the future to call TTI for all branches.
7347   }
7348   case Instruction::PHI: {
7349     auto *Phi = cast<PHINode>(I);
7350 
7351     // First-order recurrences are replaced by vector shuffles inside the loop.
7352     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7353     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7354       return TTI.getShuffleCost(
7355           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7356           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7357 
7358     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7359     // converted into select instructions. We require N - 1 selects per phi
7360     // node, where N is the number of incoming values.
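         // For example, a phi with three incoming values is modelled as two
         // vector selects.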
7361     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7362       return (Phi->getNumIncomingValues() - 1) *
7363              TTI.getCmpSelInstrCost(
7364                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7365                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7366                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7367 
7368     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7369   }
7370   case Instruction::UDiv:
7371   case Instruction::SDiv:
7372   case Instruction::URem:
7373   case Instruction::SRem:
7374     // If we have a predicated instruction, it may not be executed for each
7375     // vector lane. Get the scalarization cost and scale this amount by the
7376     // probability of executing the predicated block. If the instruction is not
7377     // predicated, we fall through to the next case.
7378     if (VF.isVector() && isScalarWithPredication(I, VF)) {
7379       InstructionCost Cost = 0;
7380 
7381       // These instructions have a non-void type, so account for the phi nodes
7382       // that we will create. This cost is likely to be zero. The phi node
7383       // cost, if any, should be scaled by the block probability because it
7384       // models a copy at the end of each predicated block.
7385       Cost += VF.getKnownMinValue() *
7386               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7387 
7388       // The cost of the non-predicated instruction.
7389       Cost += VF.getKnownMinValue() *
7390               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7391 
7392       // The cost of insertelement and extractelement instructions needed for
7393       // scalarization.
7394       Cost += getScalarizationOverhead(I, VF);
7395 
7396       // Scale the cost by the probability of executing the predicated blocks.
7397       // This assumes the predicated block for each vector lane is equally
7398       // likely.
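           // E.g., assuming getReciprocalPredBlockProb() models a 50% chance of
           // executing the predicated block, the scalarized cost above is
           // halved.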
7399       return Cost / getReciprocalPredBlockProb();
7400     }
7401     LLVM_FALLTHROUGH;
7402   case Instruction::Add:
7403   case Instruction::FAdd:
7404   case Instruction::Sub:
7405   case Instruction::FSub:
7406   case Instruction::Mul:
7407   case Instruction::FMul:
7408   case Instruction::FDiv:
7409   case Instruction::FRem:
7410   case Instruction::Shl:
7411   case Instruction::LShr:
7412   case Instruction::AShr:
7413   case Instruction::And:
7414   case Instruction::Or:
7415   case Instruction::Xor: {
7416     // Since we will replace the stride by 1, the multiplication should go away.
7417     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7418       return 0;
7419 
7420     // Detect reduction patterns
7421     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7422       return *RedCost;
7423 
7424     // Certain instructions can be cheaper to vectorize if they have a constant
7425     // second vector operand. One example of this is shifts on x86.
7426     Value *Op2 = I->getOperand(1);
7427     TargetTransformInfo::OperandValueProperties Op2VP;
7428     TargetTransformInfo::OperandValueKind Op2VK =
7429         TTI.getOperandInfo(Op2, Op2VP);
7430     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7431       Op2VK = TargetTransformInfo::OK_UniformValue;
7432 
7433     SmallVector<const Value *, 4> Operands(I->operand_values());
7434     return TTI.getArithmeticInstrCost(
7435         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7436         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7437   }
7438   case Instruction::FNeg: {
7439     return TTI.getArithmeticInstrCost(
7440         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7441         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7442         TargetTransformInfo::OP_None, I->getOperand(0), I);
7443   }
7444   case Instruction::Select: {
7445     SelectInst *SI = cast<SelectInst>(I);
7446     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7447     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7448 
7449     const Value *Op0, *Op1;
7450     using namespace llvm::PatternMatch;
7451     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7452                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7453       // select x, y, false --> x & y
7454       // select x, true, y --> x | y
7455       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7456       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7457       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7458       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7459       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7460               Op1->getType()->getScalarSizeInBits() == 1);
7461 
7462       SmallVector<const Value *, 2> Operands{Op0, Op1};
7463       return TTI.getArithmeticInstrCost(
7464           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7465           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7466     }
7467 
7468     Type *CondTy = SI->getCondition()->getType();
7469     if (!ScalarCond)
7470       CondTy = VectorType::get(CondTy, VF);
7471 
7472     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7473     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7474       Pred = Cmp->getPredicate();
7475     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7476                                   CostKind, I);
7477   }
7478   case Instruction::ICmp:
7479   case Instruction::FCmp: {
7480     Type *ValTy = I->getOperand(0)->getType();
7481     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7482     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7483       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7484     VectorTy = ToVectorTy(ValTy, VF);
7485     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7486                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7487                                   I);
7488   }
7489   case Instruction::Store:
7490   case Instruction::Load: {
7491     ElementCount Width = VF;
7492     if (Width.isVector()) {
7493       InstWidening Decision = getWideningDecision(I, Width);
7494       assert(Decision != CM_Unknown &&
7495              "CM decision should be taken at this point");
7496       if (Decision == CM_Scalarize)
7497         Width = ElementCount::getFixed(1);
7498     }
7499     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7500     return getMemoryInstructionCost(I, VF);
7501   }
7502   case Instruction::BitCast:
7503     if (I->getType()->isPointerTy())
7504       return 0;
7505     LLVM_FALLTHROUGH;
7506   case Instruction::ZExt:
7507   case Instruction::SExt:
7508   case Instruction::FPToUI:
7509   case Instruction::FPToSI:
7510   case Instruction::FPExt:
7511   case Instruction::PtrToInt:
7512   case Instruction::IntToPtr:
7513   case Instruction::SIToFP:
7514   case Instruction::UIToFP:
7515   case Instruction::Trunc:
7516   case Instruction::FPTrunc: {
7517     // Computes the CastContextHint from a Load/Store instruction.
7518     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7519       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7520              "Expected a load or a store!");
7521 
7522       if (VF.isScalar() || !TheLoop->contains(I))
7523         return TTI::CastContextHint::Normal;
7524 
7525       switch (getWideningDecision(I, VF)) {
7526       case LoopVectorizationCostModel::CM_GatherScatter:
7527         return TTI::CastContextHint::GatherScatter;
7528       case LoopVectorizationCostModel::CM_Interleave:
7529         return TTI::CastContextHint::Interleave;
7530       case LoopVectorizationCostModel::CM_Scalarize:
7531       case LoopVectorizationCostModel::CM_Widen:
7532         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7533                                         : TTI::CastContextHint::Normal;
7534       case LoopVectorizationCostModel::CM_Widen_Reverse:
7535         return TTI::CastContextHint::Reversed;
7536       case LoopVectorizationCostModel::CM_Unknown:
7537         llvm_unreachable("Instr did not go through cost modelling?");
7538       }
7539 
7540       llvm_unreachable("Unhandled case!");
7541     };
7542 
7543     unsigned Opcode = I->getOpcode();
7544     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7545     // For Trunc, the context is the only user, which must be a StoreInst.
7546     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7547       if (I->hasOneUse())
7548         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7549           CCH = ComputeCCH(Store);
7550     }
7551     // For Z/Sext, the context is the operand, which must be a LoadInst.
7552     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7553              Opcode == Instruction::FPExt) {
7554       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7555         CCH = ComputeCCH(Load);
7556     }
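         // For example (illustrative): a zext whose operand is a load that the
         // cost model decided to widen in reverse order is costed with
         // CastContextHint::Reversed, letting the target price the combined
         // reversed-load-and-extend pattern.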
7557 
7558     // We optimize the truncation of induction variables having constant
7559     // integer steps. The cost of these truncations is the same as the scalar
7560     // operation.
7561     if (isOptimizableIVTruncate(I, VF)) {
7562       auto *Trunc = cast<TruncInst>(I);
7563       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7564                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7565     }
7566 
7567     // Detect reduction patterns
7568     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7569       return *RedCost;
7570 
7571     Type *SrcScalarTy = I->getOperand(0)->getType();
7572     Type *SrcVecTy =
7573         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7574     if (canTruncateToMinimalBitwidth(I, VF)) {
7575       // This cast is going to be shrunk. This may remove the cast or it might
7576       // turn it into a slightly different cast. For example, if MinBW == 16,
7577       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7578       //
7579       // Calculate the modified src and dest types.
7580       Type *MinVecTy = VectorTy;
7581       if (Opcode == Instruction::Trunc) {
7582         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7583         VectorTy =
7584             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7585       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7586         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7587         VectorTy =
7588             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7589       }
7590     }
7591 
7592     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7593   }
7594   case Instruction::Call: {
7595     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7596       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7597         return *RedCost;
7598     bool NeedToScalarize;
7599     CallInst *CI = cast<CallInst>(I);
7600     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
7601     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7602       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7603       return std::min(CallCost, IntrinsicCost);
7604     }
7605     return CallCost;
7606   }
7607   case Instruction::ExtractValue:
7608     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7609   case Instruction::Alloca:
7610     // We cannot easily widen alloca to a scalable alloca, as
7611     // the result would need to be a vector of pointers.
7612     if (VF.isScalable())
7613       return InstructionCost::getInvalid();
7614     LLVM_FALLTHROUGH;
7615   default:
7616     // This opcode is unknown. Assume that it is the same as 'mul'.
7617     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7618   } // end of switch.
7619 }
7620 
7621 char LoopVectorize::ID = 0;
7622 
7623 static const char lv_name[] = "Loop Vectorization";
7624 
7625 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7626 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7627 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7628 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7629 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7630 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7631 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7632 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7633 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7634 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7635 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7636 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7637 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7638 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7639 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7640 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7641 
7642 namespace llvm {
7643 
7644 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7645 
7646 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7647                               bool VectorizeOnlyWhenForced) {
7648   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7649 }
7650 
7651 } // end namespace llvm
7652 
7653 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
7654   // Check if the pointer operand of a load or store instruction is
7655   // consecutive.
7656   if (auto *Ptr = getLoadStorePointerOperand(Inst))
7657     return Legal->isConsecutivePtr(getLoadStoreType(Inst), Ptr);
7658   return false;
7659 }
7660 
7661 void LoopVectorizationCostModel::collectValuesToIgnore() {
7662   // Ignore ephemeral values.
7663   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7664 
7665   // Ignore type-promoting instructions we identified during reduction
7666   // detection.
7667   for (auto &Reduction : Legal->getReductionVars()) {
7668     const RecurrenceDescriptor &RedDes = Reduction.second;
7669     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7670     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7671   }
7672   // Ignore type-casting instructions we identified during induction
7673   // detection.
7674   for (auto &Induction : Legal->getInductionVars()) {
7675     const InductionDescriptor &IndDes = Induction.second;
7676     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7677     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7678   }
7679 }
7680 
7681 void LoopVectorizationCostModel::collectInLoopReductions() {
7682   for (auto &Reduction : Legal->getReductionVars()) {
7683     PHINode *Phi = Reduction.first;
7684     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7685 
7686     // We don't collect reductions that are type promoted (yet).
7687     if (RdxDesc.getRecurrenceType() != Phi->getType())
7688       continue;
7689 
7690     // If the target would prefer this reduction to happen "in-loop", then we
7691     // want to record it as such.
7692     unsigned Opcode = RdxDesc.getOpcode();
7693     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7694         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7695                                    TargetTransformInfo::ReductionFlags()))
7696       continue;
7697 
7698     // Check that we can correctly put the reductions into the loop, by
7699     // finding the chain of operations that leads from the phi to the loop
7700     // exit value.
7701     SmallVector<Instruction *, 4> ReductionOperations =
7702         RdxDesc.getReductionOpChain(Phi, TheLoop);
7703     bool InLoop = !ReductionOperations.empty();
7704     if (InLoop) {
7705       InLoopReductionChains[Phi] = ReductionOperations;
7706       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7707       Instruction *LastChain = Phi;
7708       for (auto *I : ReductionOperations) {
7709         InLoopReductionImmediateChains[I] = LastChain;
7710         LastChain = I;
7711       }
7712     }
7713     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7714                       << " reduction for phi: " << *Phi << "\n");
7715   }
7716 }
7717 
7718 // TODO: we could return a pair of values that specify the max VF and
7719 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7720 // `buildVPlans(VF, VF)`. We cannot do this yet because VPlan currently
7721 // lacks a cost model that can choose which plan to execute when more
7722 // than one is generated.
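     // Determine a VF for the VPlan-native path from the widest vector register
     // width and the widest scalar type in the loop: for example (illustrative),
     // 256-bit registers with a widest type of i32 yield VF = 256 / 32 = 8.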
7723 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7724                                  LoopVectorizationCostModel &CM) {
7725   unsigned WidestType;
7726   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7727   return WidestVectorRegBits / WidestType;
7728 }
7729 
7730 VectorizationFactor
7731 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7732   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7733   ElementCount VF = UserVF;
7734   // Outer loop handling: They may require CFG and instruction level
7735   // transformations before even evaluating whether vectorization is profitable.
7736   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7737   // the vectorization pipeline.
7738   if (!OrigLoop->isInnermost()) {
7739     // If the user doesn't provide a vectorization factor, determine a
7740     // reasonable one.
7741     if (UserVF.isZero()) {
7742       VF = ElementCount::getFixed(determineVPlanVF(
7743           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7744               .getFixedSize(),
7745           CM));
7746       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7747 
7748       // Make sure we have a VF > 1 for stress testing.
7749       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7750         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7751                           << "overriding computed VF.\n");
7752         VF = ElementCount::getFixed(4);
7753       }
7754     }
7755     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7756     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7757            "VF needs to be a power of two");
7758     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7759                       << "VF " << VF << " to build VPlans.\n");
7760     buildVPlans(VF, VF);
7761 
7762     // For VPlan build stress testing, we bail out after VPlan construction.
7763     if (VPlanBuildStressTest)
7764       return VectorizationFactor::Disabled();
7765 
7766     return {VF, 0 /*Cost*/};
7767   }
7768 
7769   LLVM_DEBUG(
7770       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7771                 "VPlan-native path.\n");
7772   return VectorizationFactor::Disabled();
7773 }
7774 
7775 Optional<VectorizationFactor>
7776 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7777   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7778   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7779   if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7780     return None;
7781 
7782   // Invalidate interleave groups if all blocks of loop will be predicated.
7783   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7784       !useMaskedInterleavedAccesses(*TTI)) {
7785     LLVM_DEBUG(
7786         dbgs()
7787         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7788            "which requires masked-interleaved support.\n");
7789     if (CM.InterleaveInfo.invalidateGroups())
7790       // Invalidating interleave groups also requires invalidating all decisions
7791       // based on them, which includes widening decisions and uniform and scalar
7792       // values.
7793       CM.invalidateCostModelingDecisions();
7794   }
7795 
7796   ElementCount MaxUserVF =
7797       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7798   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7799   if (!UserVF.isZero() && UserVFIsLegal) {
7800     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7801            "VF needs to be a power of two");
7802     // Collect the instructions (and their associated costs) that will be more
7803     // profitable to scalarize.
7804     if (CM.selectUserVectorizationFactor(UserVF)) {
7805       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7806       CM.collectInLoopReductions();
7807       buildVPlansWithVPRecipes(UserVF, UserVF);
7808       LLVM_DEBUG(printPlans(dbgs()));
7809       return {{UserVF, 0}};
7810     } else
7811       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7812                               "InvalidCost", ORE, OrigLoop);
7813   }
7814 
7815   // Populate the set of Vectorization Factor Candidates.
7816   ElementCountSet VFCandidates;
7817   for (auto VF = ElementCount::getFixed(1);
7818        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7819     VFCandidates.insert(VF);
7820   for (auto VF = ElementCount::getScalable(1);
7821        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7822     VFCandidates.insert(VF);
7823 
7824   for (const auto &VF : VFCandidates) {
7825     // Collect Uniform and Scalar instructions after vectorization with VF.
7826     CM.collectUniformsAndScalars(VF);
7827 
7828     // Collect the instructions (and their associated costs) that will be more
7829     // profitable to scalarize.
7830     if (VF.isVector())
7831       CM.collectInstsToScalarize(VF);
7832   }
7833 
7834   CM.collectInLoopReductions();
7835   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7836   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7837 
7838   LLVM_DEBUG(printPlans(dbgs()));
7839   if (!MaxFactors.hasVector())
7840     return VectorizationFactor::Disabled();
7841 
7842   // Select the optimal vectorization factor.
7843   auto SelectedVF = CM.selectVectorizationFactor(VFCandidates);
7844 
7845   // Check if it is profitable to vectorize with runtime checks.
7846   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
7847   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
7848     bool PragmaThresholdReached =
7849         NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
7850     bool ThresholdReached =
7851         NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
7852     if ((ThresholdReached && !Hints.allowReordering()) ||
7853         PragmaThresholdReached) {
7854       ORE->emit([&]() {
7855         return OptimizationRemarkAnalysisAliasing(
7856                    DEBUG_TYPE, "CantReorderMemOps", OrigLoop->getStartLoc(),
7857                    OrigLoop->getHeader())
7858                << "loop not vectorized: cannot prove it is safe to reorder "
7859                   "memory operations";
7860       });
7861       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
7862       Hints.emitRemarkWithHints();
7863       return VectorizationFactor::Disabled();
7864     }
7865   }
7866   return SelectedVF;
7867 }
7868 
7869 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7870   assert(count_if(VPlans,
7871                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7872              1 &&
7873          "Best VF does not have a single VPlan.");
7874 
7875   for (const VPlanPtr &Plan : VPlans) {
7876     if (Plan->hasVF(VF))
7877       return *Plan.get();
7878   }
7879   llvm_unreachable("No plan found!");
7880 }
7881 
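     /// Add "llvm.loop.unroll.runtime.disable" to the loop's metadata, unless
     /// an "llvm.loop.unroll.disable" hint is already present. Illustrative
     /// resulting metadata:
     ///   !0 = distinct !{!0, ..., !1}
     ///   !1 = !{!"llvm.loop.unroll.runtime.disable"}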
7882 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7883   SmallVector<Metadata *, 4> MDs;
7884   // Reserve first location for self reference to the LoopID metadata node.
7885   MDs.push_back(nullptr);
7886   bool IsUnrollMetadata = false;
7887   MDNode *LoopID = L->getLoopID();
7888   if (LoopID) {
7889     // First find existing loop unrolling disable metadata.
7890     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7891       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7892       if (MD) {
7893         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7894         IsUnrollMetadata =
7895             S && S->getString().startswith("llvm.loop.unroll.disable");
7896       }
7897       MDs.push_back(LoopID->getOperand(i));
7898     }
7899   }
7900 
7901   if (!IsUnrollMetadata) {
7902     // Add runtime unroll disable metadata.
7903     LLVMContext &Context = L->getHeader()->getContext();
7904     SmallVector<Metadata *, 1> DisableOperands;
7905     DisableOperands.push_back(
7906         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7907     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7908     MDs.push_back(DisableNode);
7909     MDNode *NewLoopID = MDNode::get(Context, MDs);
7910     // Set operand 0 to refer to the loop id itself.
7911     NewLoopID->replaceOperandWith(0, NewLoopID);
7912     L->setLoopID(NewLoopID);
7913   }
7914 }
7915 
7916 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7917                                            VPlan &BestVPlan,
7918                                            InnerLoopVectorizer &ILV,
7919                                            DominatorTree *DT) {
7920   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7921                     << ", UF=" << BestUF << '\n');
7922 
7923   // Perform the actual loop transformation.
7924 
7925   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
7926   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7927   Value *CanonicalIVStartValue;
7928   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7929       ILV.createVectorizedLoopSkeleton();
7930   ILV.collectPoisonGeneratingRecipes(State);
7931 
7932   ILV.printDebugTracesAtStart();
7933 
7934   //===------------------------------------------------===//
7935   //
7936   // Notice: any optimization or new instruction that goes
7937   // into the code below should also be implemented in
7938   // the cost-model.
7939   //
7940   //===------------------------------------------------===//
7941 
7942   // 2. Copy and widen instructions from the old loop into the new loop.
7943   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7944                              ILV.getOrCreateVectorTripCount(nullptr),
7945                              CanonicalIVStartValue, State);
7946   BestVPlan.execute(&State);
7947 
7948   // Keep all loop hints from the original loop on the vector loop (we'll
7949   // replace the vectorizer-specific hints below).
7950   MDNode *OrigLoopID = OrigLoop->getLoopID();
7951 
7952   Optional<MDNode *> VectorizedLoopID =
7953       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7954                                       LLVMLoopVectorizeFollowupVectorized});
7955 
7956   Loop *L = LI->getLoopFor(State.CFG.PrevBB);
7957   if (VectorizedLoopID.hasValue())
7958     L->setLoopID(VectorizedLoopID.getValue());
7959   else {
7960     // Keep all loop hints from the original loop on the vector loop (we'll
7961     // replace the vectorizer-specific hints below).
7962     if (MDNode *LID = OrigLoop->getLoopID())
7963       L->setLoopID(LID);
7964 
7965     LoopVectorizeHints Hints(L, true, *ORE);
7966     Hints.setAlreadyVectorized();
7967   }
7968   // Disable runtime unrolling when vectorizing the epilogue loop.
7969   if (CanonicalIVStartValue)
7970     AddRuntimeUnrollDisableMetaData(L);
7971 
7972   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7973   //    predication, updating analyses.
7974   ILV.fixVectorizedLoop(State);
7975 
7976   ILV.printDebugTracesAtEnd();
7977 }
7978 
7979 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7980 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7981   for (const auto &Plan : VPlans)
7982     if (PrintVPlansInDotFormat)
7983       Plan->printDOT(O);
7984     else
7985       Plan->print(O);
7986 }
7987 #endif
7988 
7989 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
7990     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
7991 
7992   // We create new control-flow for the vectorized loop, so the original exit
7993   // condition will be dead after vectorization if it is only used by the
7994   // terminator.
7995   SmallVector<BasicBlock*> ExitingBlocks;
7996   OrigLoop->getExitingBlocks(ExitingBlocks);
7997   for (auto *BB : ExitingBlocks) {
7998     auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
7999     if (!Cmp || !Cmp->hasOneUse())
8000       continue;
8001 
8002     // TODO: we should introduce a getUniqueExitingBlocks on Loop
8003     if (!DeadInstructions.insert(Cmp).second)
8004       continue;
8005 
8006     // An operand of the icmp is often a dead trunc, used by IndUpdate.
8007     // TODO: can recurse through operands in general
8008     for (Value *Op : Cmp->operands()) {
8009       if (isa<TruncInst>(Op) && Op->hasOneUse())
8010         DeadInstructions.insert(cast<Instruction>(Op));
8011     }
8012   }
8013 
8014   // We create new "steps" for induction variable updates to which the original
8015   // induction variables map. An original update instruction will be dead if
8016   // all its users except the induction variable are dead.
8017   auto *Latch = OrigLoop->getLoopLatch();
8018   for (auto &Induction : Legal->getInductionVars()) {
8019     PHINode *Ind = Induction.first;
8020     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
8021 
8022     // If the tail is to be folded by masking, the primary induction variable,
8023     // if it exists, isn't dead: it will be used for masking. Don't kill it.
8024     if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
8025       continue;
8026 
8027     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
8028           return U == Ind || DeadInstructions.count(cast<Instruction>(U));
8029         }))
8030       DeadInstructions.insert(IndUpdate);
8031   }
8032 }
8033 
8034 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
8035 
8036 //===--------------------------------------------------------------------===//
8037 // EpilogueVectorizerMainLoop
8038 //===--------------------------------------------------------------------===//
8039 
8040 /// This function is partially responsible for generating the control flow
8041 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8042 std::pair<BasicBlock *, Value *>
8043 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
8044   MDNode *OrigLoopID = OrigLoop->getLoopID();
8045   Loop *Lp = createVectorLoopSkeleton("");
8046 
8047   // Generate the code to check the minimum iteration count of the vector
8048   // epilogue (see below).
8049   EPI.EpilogueIterationCountCheck =
8050       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, true);
8051   EPI.EpilogueIterationCountCheck->setName("iter.check");
8052 
8053   // Generate the code to check any assumptions that we've made for SCEV
8054   // expressions.
8055   EPI.SCEVSafetyCheck = emitSCEVChecks(Lp, LoopScalarPreHeader);
8056 
8057   // Generate the code that checks at runtime if arrays overlap. We put the
8058   // checks into a separate block to make the more common case of few elements
8059   // faster.
8060   EPI.MemSafetyCheck = emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
8061 
8062   // Generate the iteration count check for the main loop, *after* the check
8063   // for the epilogue loop, so that the path-length is shorter for the case
8064   // that goes directly through the vector epilogue. The longer-path length for
8065   // the main loop is compensated for, by the gain from vectorizing the larger
8066   // trip count. Note: the branch will get updated later on when we vectorize
8067   // the epilogue.
8068   EPI.MainLoopIterationCountCheck =
8069       emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader, false);
8070 
8071   // Generate the induction variable.
8072   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
8073   EPI.VectorTripCount = CountRoundDown;
8074   createHeaderBranch(Lp);
8075 
8076   // Skip induction resume value creation here because they will be created in
8077   // the second pass. If we created them here, they wouldn't be used anyway,
8078   // because the vplan in the second pass still contains the inductions from the
8079   // original loop.
8080 
8081   return {completeLoopSkeleton(Lp, OrigLoopID), nullptr};
8082 }
8083 
8084 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
8085   LLVM_DEBUG({
8086     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
8087            << "Main Loop VF:" << EPI.MainLoopVF
8088            << ", Main Loop UF:" << EPI.MainLoopUF
8089            << ", Epilogue Loop VF:" << EPI.EpilogueVF
8090            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8091   });
8092 }
8093 
8094 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
8095   DEBUG_WITH_TYPE(VerboseDebug, {
8096     dbgs() << "intermediate fn:\n"
8097            << *OrigLoop->getHeader()->getParent() << "\n";
8098   });
8099 }
8100 
8101 BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8102     Loop *L, BasicBlock *Bypass, bool ForEpilogue) {
8103   assert(L && "Expected valid Loop.");
8104   assert(Bypass && "Expected valid bypass basic block.");
8105   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
8106   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
8107   Value *Count = getOrCreateTripCount(L);
8108   // Reuse existing vector loop preheader for TC checks.
8109   // Note that new preheader block is generated for vector loop.
8110   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
8111   IRBuilder<> Builder(TCCheckBlock->getTerminator());
8112 
8113   // Generate code to check if the loop's trip count is less than VF * UF of the
8114   // main vector loop.
8115   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
8116       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8117 
8118   Value *CheckMinIters = Builder.CreateICmp(
8119       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
8120       "min.iters.check");
8121 
8122   if (!ForEpilogue)
8123     TCCheckBlock->setName("vector.main.loop.iter.check");
8124 
8125   // Create new preheader for vector loop.
8126   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
8127                                    DT, LI, nullptr, "vector.ph");
8128 
8129   if (ForEpilogue) {
8130     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
8131                                  DT->getNode(Bypass)->getIDom()) &&
8132            "TC check is expected to dominate Bypass");
8133 
8134     // Update dominator for Bypass & LoopExit.
8135     DT->changeImmediateDominator(Bypass, TCCheckBlock);
8136     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8137       // For loops with multiple exits, there's no edge from the middle block
8138       // to exit blocks (as the epilogue must run) and thus no need to update
8139       // the immediate dominator of the exit blocks.
8140       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8141 
8142     LoopBypassBlocks.push_back(TCCheckBlock);
8143 
8144     // Save the trip count so we don't have to regenerate it in the
8145     // vec.epilog.iter.check. This is safe to do because the trip count
8146     // generated here dominates the vector epilog iter check.
8147     EPI.TripCount = Count;
8148   }
8149 
8150   ReplaceInstWithInst(
8151       TCCheckBlock->getTerminator(),
8152       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8153 
8154   return TCCheckBlock;
8155 }
8156 
8157 //===--------------------------------------------------------------------===//
8158 // EpilogueVectorizerEpilogueLoop
8159 //===--------------------------------------------------------------------===//
8160 
8161 /// This function is partially responsible for generating the control flow
8162 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
8163 std::pair<BasicBlock *, Value *>
8164 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8165   MDNode *OrigLoopID = OrigLoop->getLoopID();
8166   Loop *Lp = createVectorLoopSkeleton("vec.epilog.");
8167 
8168   // Now, compare the remaining count and if there aren't enough iterations to
8169   // execute the vectorized epilogue skip to the scalar part.
8170   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
8171   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
8172   LoopVectorPreHeader =
8173       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
8174                  LI, nullptr, "vec.epilog.ph");
8175   emitMinimumVectorEpilogueIterCountCheck(Lp, LoopScalarPreHeader,
8176                                           VecEpilogueIterationCountCheck);
8177 
8178   // Adjust the control flow taking the state info from the main loop
8179   // vectorization into account.
8180   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
8181          "expected this to be saved from the previous pass.");
8182   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
8183       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
8184 
8185   DT->changeImmediateDominator(LoopVectorPreHeader,
8186                                EPI.MainLoopIterationCountCheck);
8187 
8188   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
8189       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8190 
8191   if (EPI.SCEVSafetyCheck)
8192     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
8193         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8194   if (EPI.MemSafetyCheck)
8195     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
8196         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
8197 
8198   DT->changeImmediateDominator(
8199       VecEpilogueIterationCountCheck,
8200       VecEpilogueIterationCountCheck->getSinglePredecessor());
8201 
8202   DT->changeImmediateDominator(LoopScalarPreHeader,
8203                                EPI.EpilogueIterationCountCheck);
8204   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
8205     // If there is an epilogue which must run, there's no edge from the
8206     // middle block to exit blocks and thus no need to update the immediate
8207     // dominator of the exit blocks.
8208     DT->changeImmediateDominator(LoopExitBlock,
8209                                  EPI.EpilogueIterationCountCheck);
8210 
8211   // Keep track of bypass blocks, as they feed start values to the induction
8212   // phis in the scalar loop preheader.
8213   if (EPI.SCEVSafetyCheck)
8214     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
8215   if (EPI.MemSafetyCheck)
8216     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
8217   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
8218 
8219   // The vec.epilog.iter.check block may contain Phi nodes from reductions which
8220   // merge control-flow from the latch block and the middle block. Update the
8221   // incoming values here and move the Phi into the preheader.
8222   SmallVector<PHINode *, 4> PhisInBlock;
8223   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
8224     PhisInBlock.push_back(&Phi);
8225 
8226   for (PHINode *Phi : PhisInBlock) {
8227     Phi->replaceIncomingBlockWith(
8228         VecEpilogueIterationCountCheck->getSinglePredecessor(),
8229         VecEpilogueIterationCountCheck);
8230     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
8231     if (EPI.SCEVSafetyCheck)
8232       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
8233     if (EPI.MemSafetyCheck)
8234       Phi->removeIncomingValue(EPI.MemSafetyCheck);
8235     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
8236   }
8237 
8238   // Generate a resume induction for the vector epilogue and put it in the
8239   // vector epilogue preheader.
8240   Type *IdxTy = Legal->getWidestInductionType();
8241   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
8242                                          LoopVectorPreHeader->getFirstNonPHI());
8243   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
8244   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
8245                            EPI.MainLoopIterationCountCheck);
8246 
8247   // Generate the induction variable.
8248   createHeaderBranch(Lp);
8249 
8250   // Generate induction resume values. These variables save the new starting
8251   // indexes for the scalar loop. They are used to test if there are any tail
8252   // iterations left once the vector loop has completed.
8253   // Note that when the vectorized epilogue is skipped due to iteration count
8254   // check, then the resume value for the induction variable comes from
8255   // the trip count of the main vector loop, hence passing the AdditionalBypass
8256   // argument.
8257   createInductionResumeValues(Lp, {VecEpilogueIterationCountCheck,
8258                                    EPI.VectorTripCount} /* AdditionalBypass */);
8259 
8260   return {completeLoopSkeleton(Lp, OrigLoopID), EPResumeVal};
8261 }
8262 
8263 BasicBlock *
8264 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
8265     Loop *L, BasicBlock *Bypass, BasicBlock *Insert) {
8266 
8267   assert(EPI.TripCount &&
8268          "Expected trip count to have been saved in the first pass.");
8269   assert(
8270       (!isa<Instruction>(EPI.TripCount) ||
8271        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
8272       "saved trip count does not dominate insertion point.");
8273   Value *TC = EPI.TripCount;
8274   IRBuilder<> Builder(Insert->getTerminator());
8275   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
8276 
8277   // Generate code to check if the loop's trip count is less than VF * UF of the
8278   // vector epilogue loop.
8279   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
8280       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
8281 
8282   Value *CheckMinIters =
8283       Builder.CreateICmp(P, Count,
8284                          createStepForVF(Builder, Count->getType(),
8285                                          EPI.EpilogueVF, EPI.EpilogueUF),
8286                          "min.epilog.iters.check");
8287 
8288   ReplaceInstWithInst(
8289       Insert->getTerminator(),
8290       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
8291 
8292   LoopBypassBlocks.push_back(Insert);
8293   return Insert;
8294 }
8295 
8296 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
8297   LLVM_DEBUG({
8298     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
8299            << "Epilogue Loop VF:" << EPI.EpilogueVF
8300            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
8301   });
8302 }
8303 
8304 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
8305   DEBUG_WITH_TYPE(VerboseDebug, {
8306     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
8307   });
8308 }
8309 
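     /// Evaluate \p Predicate on \p Range.Start and clamp \p Range.End down to
     /// the first VF at which the predicate's answer changes. For example
     /// (illustrative), for Range = [4, 32) and a predicate that holds for VF 4
     /// and VF 8 but not VF 16, Range becomes [4, 16) and true is returned.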
8310 bool LoopVectorizationPlanner::getDecisionAndClampRange(
8311     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
8312   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
8313   bool PredicateAtRangeStart = Predicate(Range.Start);
8314 
8315   for (ElementCount TmpVF = Range.Start * 2;
8316        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
8317     if (Predicate(TmpVF) != PredicateAtRangeStart) {
8318       Range.End = TmpVF;
8319       break;
8320     }
8321 
8322   return PredicateAtRangeStart;
8323 }
8324 
8325 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
8326 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
8327 /// of VF's starting at a given VF and extending it as much as possible. Each
8328 /// vectorization decision can potentially shorten this sub-range during
8329 /// buildVPlan().
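     /// For example (illustrative), with MinVF = 1 and MaxVF = 16, if some
     /// decision changes starting at VF = 8, two plans are built: one covering
     /// {1, 2, 4} and one covering {8, 16}.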
8330 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
8331                                            ElementCount MaxVF) {
8332   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8333   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8334     VFRange SubRange = {VF, MaxVFPlusOne};
8335     VPlans.push_back(buildVPlan(SubRange));
8336     VF = SubRange.End;
8337   }
8338 }
8339 
8340 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
8341                                          VPlanPtr &Plan) {
8342   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
8343 
8344   // Look for cached value.
8345   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8346   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
8347   if (ECEntryIt != EdgeMaskCache.end())
8348     return ECEntryIt->second;
8349 
8350   VPValue *SrcMask = createBlockInMask(Src, Plan);
8351 
8352   // The terminator has to be a branch inst!
8353   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8354   assert(BI && "Unexpected terminator found");
8355 
8356   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
8357     return EdgeMaskCache[Edge] = SrcMask;
8358 
8359   // If source is an exiting block, we know the exit edge is dynamically dead
8360   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
8361   // adding uses of an otherwise potentially dead instruction.
8362   if (OrigLoop->isLoopExiting(Src))
8363     return EdgeMaskCache[Edge] = SrcMask;
8364 
8365   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
8366   assert(EdgeMask && "No Edge Mask found for condition");
8367 
8368   if (BI->getSuccessor(0) != Dst)
8369     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
8370 
8371   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
8372     // The condition is 'SrcMask && EdgeMask', which is equivalent to
8373     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
8374     // The select version does not introduce new UB if SrcMask is false and
8375     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
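         // Sketch of the emitted mask (illustrative):
         //   %edge.mask = select i1 %src.mask, i1 %block.cond, i1 false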
8376     VPValue *False = Plan->getOrAddVPValue(
8377         ConstantInt::getFalse(BI->getCondition()->getType()));
8378     EdgeMask =
8379         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
8380   }
8381 
8382   return EdgeMaskCache[Edge] = EdgeMask;
8383 }
8384 
8385 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8386   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8387 
8388   // Look for cached value.
8389   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8390   if (BCEntryIt != BlockMaskCache.end())
8391     return BCEntryIt->second;
8392 
8393   // All-one mask is modelled as no-mask following the convention for masked
8394   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8395   VPValue *BlockMask = nullptr;
8396 
8397   if (OrigLoop->getHeader() == BB) {
8398     if (!CM.blockNeedsPredicationForAnyReason(BB))
8399       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8400 
8401     // Introduce the early-exit compare IV <= BTC to form header block mask.
8402     // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8403     // constructing the desired canonical IV in the header block as its first
8404     // non-phi instructions.
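    // For example (illustrative): with a scalar trip count of 10 and VF = 4,
    // BTC = 9 and the last vector iteration compares IV = <8, 9, 10, 11>
    // against the BTC splat, yielding the mask <1, 1, 0, 0>. Comparing
    // IV < TC would break if TC wrapped to 0, whereas BTC = TC - 1 is always
    // representable.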
8405     assert(CM.foldTailByMasking() && "must fold the tail");
8406     VPBasicBlock *HeaderVPBB = Plan->getEntry()->getEntryBasicBlock();
8407     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8408     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8409     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8410 
8411     VPBuilder::InsertPointGuard Guard(Builder);
8412     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8413     if (CM.TTI.emitGetActiveLaneMask()) {
8414       VPValue *TC = Plan->getOrCreateTripCount();
8415       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC});
8416     } else {
8417       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8418       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8419     }
8420     return BlockMaskCache[BB] = BlockMask;
8421   }
8422 
8423   // This is the block mask. We OR all incoming edges.
8424   for (auto *Predecessor : predecessors(BB)) {
8425     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8426     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8427       return BlockMaskCache[BB] = EdgeMask;
8428 
8429     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8430       BlockMask = EdgeMask;
8431       continue;
8432     }
8433 
8434     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8435   }
8436 
8437   return BlockMaskCache[BB] = BlockMask;
8438 }
8439 
8440 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8441                                                 ArrayRef<VPValue *> Operands,
8442                                                 VFRange &Range,
8443                                                 VPlanPtr &Plan) {
8444   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8445          "Must be called with either a load or store");
8446 
8447   auto willWiden = [&](ElementCount VF) -> bool {
8448     if (VF.isScalar())
8449       return false;
8450     LoopVectorizationCostModel::InstWidening Decision =
8451         CM.getWideningDecision(I, VF);
8452     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8453            "CM decision should be taken at this point.");
8454     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8455       return true;
8456     if (CM.isScalarAfterVectorization(I, VF) ||
8457         CM.isProfitableToScalarize(I, VF))
8458       return false;
8459     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8460   };
8461 
8462   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8463     return nullptr;
8464 
8465   VPValue *Mask = nullptr;
8466   if (Legal->isMaskRequired(I))
8467     Mask = createBlockInMask(I->getParent(), Plan);
8468 
8469   // Determine if the pointer operand of the access is either consecutive or
8470   // reverse consecutive.
8471   LoopVectorizationCostModel::InstWidening Decision =
8472       CM.getWideningDecision(I, Range.Start);
8473   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8474   bool Consecutive =
8475       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8476 
8477   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8478     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8479                                               Consecutive, Reverse);
8480 
8481   StoreInst *Store = cast<StoreInst>(I);
8482   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8483                                             Mask, Consecutive, Reverse);
8484 }
8485 
8486 static VPWidenIntOrFpInductionRecipe *
8487 createWidenInductionRecipe(PHINode *Phi, Instruction *PhiOrTrunc,
8488                            VPValue *Start, const InductionDescriptor &IndDesc,
8489                            LoopVectorizationCostModel &CM, Loop &OrigLoop,
8490                            VFRange &Range) {
8491   // Returns true if an instruction \p I should be scalarized instead of
8492   // vectorized for the chosen vectorization factor.
8493   auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8494     return CM.isScalarAfterVectorization(I, VF) ||
8495            CM.isProfitableToScalarize(I, VF);
8496   };
8497 
8498   bool NeedsScalarIV = LoopVectorizationPlanner::getDecisionAndClampRange(
8499       [&](ElementCount VF) {
8500         // Returns true if we should generate a scalar version of \p IV.
8501         if (ShouldScalarizeInstruction(PhiOrTrunc, VF))
8502           return true;
8503         auto isScalarInst = [&](User *U) -> bool {
8504           auto *I = cast<Instruction>(U);
8505           return OrigLoop.contains(I) && ShouldScalarizeInstruction(I, VF);
8506         };
8507         return any_of(PhiOrTrunc->users(), isScalarInst);
8508       },
8509       Range);
8510   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8511       [&](ElementCount VF) {
8512         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8513       },
8514       Range);
8515   assert(IndDesc.getStartValue() ==
8516          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8517   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8518     return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, TruncI,
8519                                              NeedsScalarIV, !NeedsScalarIVOnly);
8520   }
8521   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8522   return new VPWidenIntOrFpInductionRecipe(Phi, Start, IndDesc, NeedsScalarIV,
8523                                            !NeedsScalarIVOnly);
8524 }
8525 
8526 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
8527     PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) const {
8528 
8529   // Check if this is an integer or fp induction. If so, build the recipe that
8530   // produces its scalar and vector values.
8531   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8532     return createWidenInductionRecipe(Phi, Phi, Operands[0], *II, CM, *OrigLoop,
8533                                       Range);
8534 
8535   return nullptr;
8536 }
8537 
8538 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8539     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range,
8540     VPlan &Plan) const {
8541   // Optimize the special case where the source is a constant integer
8542   // induction variable. Notice that we can only optimize the 'trunc' case
8543   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8544   // (c) other casts depend on pointer size.
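  // For example (illustrative): for an i64 induction %iv and
  //   %t = trunc i64 %iv to i32
  // the truncate can be widened directly as an i32 induction with the same
  // start and step, instead of widening %iv to <VF x i64> and truncating.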
8545 
8546   // Determine whether \p K is a truncation based on an induction variable that
8547   // can be optimized.
8548   auto isOptimizableIVTruncate =
8549       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8550     return [=](ElementCount VF) -> bool {
8551       return CM.isOptimizableIVTruncate(K, VF);
8552     };
8553   };
8554 
8555   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8556           isOptimizableIVTruncate(I), Range)) {
8557 
8558     auto *Phi = cast<PHINode>(I->getOperand(0));
8559     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8560     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8561     return createWidenInductionRecipe(Phi, I, Start, II, CM, *OrigLoop, Range);
8562   }
8563   return nullptr;
8564 }
8565 
8566 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8567                                                 ArrayRef<VPValue *> Operands,
8568                                                 VPlanPtr &Plan) {
8569   // If all incoming values are equal, the incoming VPValue can be used directly
8570   // instead of creating a new VPBlendRecipe.
8571   VPValue *FirstIncoming = Operands[0];
8572   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8573         return FirstIncoming == Inc;
8574       })) {
8575     return Operands[0];
8576   }
8577 
8578   // We know that all PHIs in non-header blocks are converted into selects, so
8579   // we don't have to worry about the insertion order and we can just use the
8580   // builder. At this point we generate the predication tree. There may be
8581   // duplications since this is a simple recursive scan, but future
8582   // optimizations will clean it up.
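  // For example (illustrative): a phi with incoming values (%a from BB0,
  // %b from BB1) becomes a VPBlendRecipe that later generates roughly
  //   %blend = select <VF x i1> %edge.mask.BB1, <VF x ty> %b, <VF x ty> %a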
8583   SmallVector<VPValue *, 2> OperandsWithMask;
8584   unsigned NumIncoming = Phi->getNumIncomingValues();
8585 
8586   for (unsigned In = 0; In < NumIncoming; In++) {
8587     VPValue *EdgeMask =
8588       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8589     assert((EdgeMask || NumIncoming == 1) &&
8590            "Multiple predecessors with one having a full mask");
8591     OperandsWithMask.push_back(Operands[In]);
8592     if (EdgeMask)
8593       OperandsWithMask.push_back(EdgeMask);
8594   }
8595   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8596 }
8597 
8598 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8599                                                    ArrayRef<VPValue *> Operands,
8600                                                    VFRange &Range) const {
8601 
8602   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8603       [this, CI](ElementCount VF) {
8604         return CM.isScalarWithPredication(CI, VF);
8605       },
8606       Range);
8607 
8608   if (IsPredicated)
8609     return nullptr;
8610 
8611   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8612   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8613              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8614              ID == Intrinsic::pseudoprobe ||
8615              ID == Intrinsic::experimental_noalias_scope_decl))
8616     return nullptr;
8617 
8618   auto willWiden = [&](ElementCount VF) -> bool {
8619     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
    // The following case may be scalarized depending on the VF.
    // The flag indicates whether to use an intrinsic or a plain call for the
    // vectorized version of the instruction, i.e. whether an intrinsic call
    // is cheaper than a library call.
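    // For example (illustrative): for a call to llvm.sin.f32 at VF = 4, the
    // intrinsic cost models llvm.sin.v4f32 while the call cost models a
    // vector library routine (if available); the cheaper of the two is used,
    // provided the call does not have to be scalarized.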
8624     bool NeedToScalarize = false;
8625     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8626     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8627     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8628     return UseVectorIntrinsic || !NeedToScalarize;
8629   };
8630 
8631   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8632     return nullptr;
8633 
8634   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8635   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8636 }
8637 
8638 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8639   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8640          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8641   // Instruction should be widened, unless it is scalar after vectorization,
8642   // scalarization is profitable or it is predicated.
8643   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8644     return CM.isScalarAfterVectorization(I, VF) ||
8645            CM.isProfitableToScalarize(I, VF) ||
8646            CM.isScalarWithPredication(I, VF);
8647   };
8648   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8649                                                              Range);
8650 }
8651 
8652 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8653                                            ArrayRef<VPValue *> Operands) const {
8654   auto IsVectorizableOpcode = [](unsigned Opcode) {
8655     switch (Opcode) {
8656     case Instruction::Add:
8657     case Instruction::And:
8658     case Instruction::AShr:
8659     case Instruction::BitCast:
8660     case Instruction::FAdd:
8661     case Instruction::FCmp:
8662     case Instruction::FDiv:
8663     case Instruction::FMul:
8664     case Instruction::FNeg:
8665     case Instruction::FPExt:
8666     case Instruction::FPToSI:
8667     case Instruction::FPToUI:
8668     case Instruction::FPTrunc:
8669     case Instruction::FRem:
8670     case Instruction::FSub:
8671     case Instruction::ICmp:
8672     case Instruction::IntToPtr:
8673     case Instruction::LShr:
8674     case Instruction::Mul:
8675     case Instruction::Or:
8676     case Instruction::PtrToInt:
8677     case Instruction::SDiv:
8678     case Instruction::Select:
8679     case Instruction::SExt:
8680     case Instruction::Shl:
8681     case Instruction::SIToFP:
8682     case Instruction::SRem:
8683     case Instruction::Sub:
8684     case Instruction::Trunc:
8685     case Instruction::UDiv:
8686     case Instruction::UIToFP:
8687     case Instruction::URem:
8688     case Instruction::Xor:
8689     case Instruction::ZExt:
8690       return true;
8691     }
8692     return false;
8693   };
8694 
8695   if (!IsVectorizableOpcode(I->getOpcode()))
8696     return nullptr;
8697 
8698   // Success: widen this instruction.
8699   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8700 }
8701 
8702 void VPRecipeBuilder::fixHeaderPhis() {
8703   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8704   for (VPHeaderPHIRecipe *R : PhisToFix) {
8705     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8706     VPRecipeBase *IncR =
8707         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8708     R->addOperand(IncR->getVPSingleValue());
8709   }
8710 }
8711 
8712 VPBasicBlock *VPRecipeBuilder::handleReplication(
8713     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8714     VPlanPtr &Plan) {
8715   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8716       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8717       Range);
8718 
8719   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8720       [&](ElementCount VF) { return CM.isPredicatedInst(I, VF, IsUniform); },
8721       Range);
8722 
8723   // Even if the instruction is not marked as uniform, there are certain
8724   // intrinsic calls that can be effectively treated as such, so we check for
8725   // them here. Conservatively, we only do this for scalable vectors, since
8726   // for fixed-width VFs we can always fall back on full scalarization.
8727   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8728     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8729     case Intrinsic::assume:
8730     case Intrinsic::lifetime_start:
8731     case Intrinsic::lifetime_end:
8732       // For scalable vectors if one of the operands is variant then we still
8733       // want to mark as uniform, which will generate one instruction for just
8734       // the first lane of the vector. We can't scalarize the call in the same
8735       // way as for fixed-width vectors because we don't know how many lanes
8736       // there are.
8737       //
8738       // The reasons for doing it this way for scalable vectors are:
8739       //   1. For the assume intrinsic generating the instruction for the first
      //      lane is still better than not generating any at all. For
8741       //      example, the input may be a splat across all lanes.
8742       //   2. For the lifetime start/end intrinsics the pointer operand only
8743       //      does anything useful when the input comes from a stack object,
8744       //      which suggests it should always be uniform. For non-stack objects
8745       //      the effect is to poison the object, which still allows us to
8746       //      remove the call.
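      //
      // For example (illustrative): a call to llvm.assume(i1 %c) inside a
      // loop vectorized with <vscale x 4 x i32> is emitted once, using the
      // value of %c for the first lane, rather than vscale x 4 times.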
8747       IsUniform = true;
8748       break;
8749     default:
8750       break;
8751     }
8752   }
8753 
8754   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8755                                        IsUniform, IsPredicated);
8756   setRecipe(I, Recipe);
8757   Plan->addVPValue(I, Recipe);
8758 
8759   // Find if I uses a predicated instruction. If so, it will use its scalar
8760   // value. Avoid hoisting the insert-element which packs the scalar value into
8761   // a vector value, as that happens iff all users use the vector value.
8762   for (VPValue *Op : Recipe->operands()) {
8763     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8764     if (!PredR)
8765       continue;
8766     auto *RepR =
8767         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8768     assert(RepR->isPredicated() &&
8769            "expected Replicate recipe to be predicated");
8770     RepR->setAlsoPack(false);
8771   }
8772 
8773   // Finalize the recipe for Instr, first if it is not predicated.
8774   if (!IsPredicated) {
8775     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8776     VPBB->appendRecipe(Recipe);
8777     return VPBB;
8778   }
8779   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8780 
8781   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8782   assert(SingleSucc && "VPBB must have a single successor when handling "
8783                        "predicated replication.");
8784   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8785   // Record predicated instructions for above packing optimizations.
8786   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
8787   VPBlockUtils::insertBlockAfter(Region, VPBB);
8788   auto *RegSucc = new VPBasicBlock();
8789   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8790   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8791   return RegSucc;
8792 }
8793 
8794 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
8795                                                       VPRecipeBase *PredRecipe,
8796                                                       VPlanPtr &Plan) {
8797   // Instructions marked for predication are replicated and placed under an
8798   // if-then construct to prevent side-effects.
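  //
  // Illustratively, for a predicated store the region looks like:
  //   pred.store.entry:    BRANCH-ON-MASK %block.in.mask
  //   pred.store.if:       REPLICATE store (predicated)
  //   pred.store.continue: PHI (only for instructions producing a value)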
8799 
8800   // Generate recipes to compute the block mask for this region.
8801   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8802 
8803   // Build the triangular if-then region.
8804   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8805   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8806   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8807   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8808   auto *PHIRecipe = Instr->getType()->isVoidTy()
8809                         ? nullptr
8810                         : new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
8811   if (PHIRecipe) {
8812     Plan->removeVPValueFor(Instr);
8813     Plan->addVPValue(Instr, PHIRecipe);
8814   }
8815   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8816   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8817   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
8818 
8819   // Note: first set Entry as region entry and then connect successors starting
8820   // from it in order, to propagate the "parent" of each VPBasicBlock.
8821   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
8822   VPBlockUtils::connectBlocks(Pred, Exit);
8823 
8824   return Region;
8825 }
8826 
8827 VPRecipeOrVPValueTy
8828 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8829                                         ArrayRef<VPValue *> Operands,
8830                                         VFRange &Range, VPlanPtr &Plan) {
8831   // First, check for specific widening recipes that deal with calls, memory
8832   // operations, inductions and Phi nodes.
8833   if (auto *CI = dyn_cast<CallInst>(Instr))
8834     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8835 
8836   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8837     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8838 
8839   VPRecipeBase *Recipe;
8840   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8841     if (Phi->getParent() != OrigLoop->getHeader())
8842       return tryToBlend(Phi, Operands, Plan);
8843     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
8844       return toVPRecipeResult(Recipe);
8845 
8846     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8847     if (Legal->isReductionVariable(Phi) || Legal->isFirstOrderRecurrence(Phi)) {
8848       VPValue *StartV = Operands[0];
8849       if (Legal->isReductionVariable(Phi)) {
8850         const RecurrenceDescriptor &RdxDesc =
8851             Legal->getReductionVars().find(Phi)->second;
8852         assert(RdxDesc.getRecurrenceStartValue() ==
8853                Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8854         PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8855                                              CM.isInLoopReduction(Phi),
8856                                              CM.useOrderedReductions(RdxDesc));
8857       } else {
8858         PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8859       }
8860 
8861       // Record the incoming value from the backedge, so we can add the incoming
8862       // value from the backedge after all recipes have been created.
8863       recordRecipeOf(cast<Instruction>(
8864           Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8865       PhisToFix.push_back(PhiRecipe);
8866     } else {
8867       // TODO: record backedge value for remaining pointer induction phis.
8868       assert(Phi->getType()->isPointerTy() &&
8869              "only pointer phis should be handled here");
8870       assert(Legal->getInductionVars().count(Phi) &&
8871              "Not an induction variable");
8872       InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
8873       VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
8874       PhiRecipe = new VPWidenPHIRecipe(Phi, Start);
8875     }
8876 
8877     return toVPRecipeResult(PhiRecipe);
8878   }
8879 
8880   if (isa<TruncInst>(Instr) &&
8881       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8882                                                Range, *Plan)))
8883     return toVPRecipeResult(Recipe);
8884 
8885   if (!shouldWiden(Instr, Range))
8886     return nullptr;
8887 
8888   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8889     return toVPRecipeResult(new VPWidenGEPRecipe(
8890         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8891 
8892   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8893     bool InvariantCond =
8894         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8895     return toVPRecipeResult(new VPWidenSelectRecipe(
8896         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8897   }
8898 
8899   return toVPRecipeResult(tryToWiden(Instr, Operands));
8900 }
8901 
8902 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8903                                                         ElementCount MaxVF) {
8904   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8905 
8906   // Collect instructions from the original loop that will become trivially dead
8907   // in the vectorized loop. We don't need to vectorize these instructions. For
8908   // example, original induction update instructions can become dead because we
8909   // separately emit induction "steps" when generating code for the new loop.
8910   // Similarly, we create a new latch condition when setting up the structure
8911   // of the new loop, so the old one can become dead.
8912   SmallPtrSet<Instruction *, 4> DeadInstructions;
8913   collectTriviallyDeadInstructions(DeadInstructions);
8914 
8915   // Add assume instructions we need to drop to DeadInstructions, to prevent
8916   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
8918   // control flow is preserved, we should keep them.
8919   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8920   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8921 
8922   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8923   // Dead instructions do not need sinking. Remove them from SinkAfter.
8924   for (Instruction *I : DeadInstructions)
8925     SinkAfter.erase(I);
8926 
8927   // Cannot sink instructions after dead instructions (there won't be any
8928   // recipes for them). Instead, find the first non-dead previous instruction.
8929   for (auto &P : Legal->getSinkAfter()) {
8930     Instruction *SinkTarget = P.second;
8931     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8932     (void)FirstInst;
8933     while (DeadInstructions.contains(SinkTarget)) {
8934       assert(
8935           SinkTarget != FirstInst &&
8936           "Must find a live instruction (at least the one feeding the "
8937           "first-order recurrence PHI) before reaching beginning of the block");
8938       SinkTarget = SinkTarget->getPrevNode();
8939       assert(SinkTarget != P.first &&
8940              "sink source equals target, no sinking required");
8941     }
8942     P.second = SinkTarget;
8943   }
8944 
8945   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
8946   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8947     VFRange SubRange = {VF, MaxVFPlusOne};
8948     VPlans.push_back(
8949         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8950     VF = SubRange.End;
8951   }
8952 }
8953 
8954 // Add a VPCanonicalIVPHIRecipe starting at 0 to the header, a
8955 // CanonicalIVIncrement{NUW} VPInstruction to increment it by VF * UF and a
8956 // BranchOnCount VPInstruction to the latch.
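// Illustratively, the resulting VPlan contains (recipe names are not literal):
//   header:  EMIT %iv = CANONICAL-INDUCTION ir<0>, %iv.next
//   ...
//   latch:   EMIT %iv.next = CanonicalIVIncrement{NUW} %iv
//            EMIT BranchOnCount %iv.next, %vector.trip.count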
8957 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8958                                   bool HasNUW, bool IsVPlanNative) {
8959   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8960   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8961 
8962   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8963   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8964   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8965   if (IsVPlanNative)
8966     Header = cast<VPBasicBlock>(Header->getSingleSuccessor());
8967   Header->insert(CanonicalIVPHI, Header->begin());
8968 
8969   auto *CanonicalIVIncrement =
8970       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8971                                : VPInstruction::CanonicalIVIncrement,
8972                         {CanonicalIVPHI}, DL);
8973   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8974 
8975   VPBasicBlock *EB = TopRegion->getExitBasicBlock();
8976   if (IsVPlanNative) {
8977     EB = cast<VPBasicBlock>(EB->getSinglePredecessor());
8978     EB->setCondBit(nullptr);
8979   }
8980   EB->appendRecipe(CanonicalIVIncrement);
8981 
8982   auto *BranchOnCount =
8983       new VPInstruction(VPInstruction::BranchOnCount,
8984                         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8985   EB->appendRecipe(BranchOnCount);
8986 }
8987 
8988 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8989     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8990     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8991 
8992   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8993 
8994   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8995 
8996   // ---------------------------------------------------------------------------
8997   // Pre-construction: record ingredients whose recipes we'll need to further
8998   // process after constructing the initial VPlan.
8999   // ---------------------------------------------------------------------------
9000 
9001   // Mark instructions we'll need to sink later and their targets as
9002   // ingredients whose recipe we'll need to record.
9003   for (auto &Entry : SinkAfter) {
9004     RecipeBuilder.recordRecipeOf(Entry.first);
9005     RecipeBuilder.recordRecipeOf(Entry.second);
9006   }
9007   for (auto &Reduction : CM.getInLoopReductionChains()) {
9008     PHINode *Phi = Reduction.first;
9009     RecurKind Kind =
9010         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
9011     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9012 
9013     RecipeBuilder.recordRecipeOf(Phi);
9014     for (auto &R : ReductionOperations) {
9015       RecipeBuilder.recordRecipeOf(R);
      // For min/max reductions, where we have a pair of icmp/select, we also
9017       // need to record the ICmp recipe, so it can be removed later.
9018       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9019              "Only min/max recurrences allowed for inloop reductions");
9020       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
9021         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
9022     }
9023   }
9024 
9025   // For each interleave group which is relevant for this (possibly trimmed)
9026   // Range, add it to the set of groups to be later applied to the VPlan and add
9027   // placeholders for its members' Recipes which we'll be replacing with a
9028   // single VPInterleaveRecipe.
9029   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
9030     auto applyIG = [IG, this](ElementCount VF) -> bool {
9031       return (VF.isVector() && // Query is illegal for VF == 1
9032               CM.getWideningDecision(IG->getInsertPos(), VF) ==
9033                   LoopVectorizationCostModel::CM_Interleave);
9034     };
9035     if (!getDecisionAndClampRange(applyIG, Range))
9036       continue;
9037     InterleaveGroups.insert(IG);
9038     for (unsigned i = 0; i < IG->getFactor(); i++)
9039       if (Instruction *Member = IG->getMember(i))
9040         RecipeBuilder.recordRecipeOf(Member);
  }
9042 
9043   // ---------------------------------------------------------------------------
9044   // Build initial VPlan: Scan the body of the loop in a topological order to
9045   // visit each basic block after having visited its predecessor basic blocks.
9046   // ---------------------------------------------------------------------------
9047 
9048   // Create initial VPlan skeleton, with separate header and latch blocks.
9049   VPBasicBlock *HeaderVPBB = new VPBasicBlock();
9050   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
9051   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
9052   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
9053   auto Plan = std::make_unique<VPlan>(TopRegion);
9054 
9055   Instruction *DLInst =
9056       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
9057   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
9058                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
9059                         !CM.foldTailByMasking(), false);
9060 
9061   // Scan the body of the loop in a topological order to visit each basic block
9062   // after having visited its predecessor basic blocks.
9063   LoopBlocksDFS DFS(OrigLoop);
9064   DFS.perform(LI);
9065 
9066   VPBasicBlock *VPBB = HeaderVPBB;
9067   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
9068   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9069     // Relevant instructions from basic block BB will be grouped into VPRecipe
9070     // ingredients and fill a new VPBasicBlock.
9071     unsigned VPBBsForBB = 0;
9072     VPBB->setName(BB->getName());
9073     Builder.setInsertPoint(VPBB);
9074 
9075     // Introduce each ingredient into VPlan.
    // TODO: Model and preserve debug intrinsics in VPlan.
9077     for (Instruction &I : BB->instructionsWithoutDebug()) {
9078       Instruction *Instr = &I;
9079 
9080       // First filter out irrelevant instructions, to ensure no recipes are
9081       // built for them.
9082       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
9083         continue;
9084 
9085       SmallVector<VPValue *, 4> Operands;
9086       auto *Phi = dyn_cast<PHINode>(Instr);
9087       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
9088         Operands.push_back(Plan->getOrAddVPValue(
9089             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
9090       } else {
9091         auto OpRange = Plan->mapToVPValues(Instr->operands());
9092         Operands = {OpRange.begin(), OpRange.end()};
9093       }
9094       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
9095               Instr, Operands, Range, Plan)) {
9096         // If Instr can be simplified to an existing VPValue, use it.
9097         if (RecipeOrValue.is<VPValue *>()) {
9098           auto *VPV = RecipeOrValue.get<VPValue *>();
9099           Plan->addVPValue(Instr, VPV);
9100           // If the re-used value is a recipe, register the recipe for the
9101           // instruction, in case the recipe for Instr needs to be recorded.
9102           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
9103             RecipeBuilder.setRecipe(Instr, R);
9104           continue;
9105         }
9106         // Otherwise, add the new recipe.
9107         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
9108         for (auto *Def : Recipe->definedValues()) {
9109           auto *UV = Def->getUnderlyingValue();
9110           Plan->addVPValue(UV, Def);
9111         }
9112 
9113         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
9114             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
9115           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
9116           // of the header block. That can happen for truncates of induction
9117           // variables. Those recipes are moved to the phi section of the header
9118           // block after applying SinkAfter, which relies on the original
9119           // position of the trunc.
9120           assert(isa<TruncInst>(Instr));
9121           InductionsToMove.push_back(
9122               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
9123         }
9124         RecipeBuilder.setRecipe(Instr, Recipe);
9125         VPBB->appendRecipe(Recipe);
9126         continue;
9127       }
9128 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
9131       VPBasicBlock *NextVPBB =
9132           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
9133       if (NextVPBB != VPBB) {
9134         VPBB = NextVPBB;
9135         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
9136                                     : "");
9137       }
9138     }
9139 
9140     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
9141     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9142   }
9143 
9144   // Fold the last, empty block into its predecessor.
9145   VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
9146   assert(VPBB && "expected to fold last (empty) block");
9147   // After here, VPBB should not be used.
9148   VPBB = nullptr;
9149 
9150   assert(isa<VPRegionBlock>(Plan->getEntry()) &&
9151          !Plan->getEntry()->getEntryBasicBlock()->empty() &&
9152          "entry block must be set to a VPRegionBlock having a non-empty entry "
9153          "VPBasicBlock");
9154   RecipeBuilder.fixHeaderPhis();
9155 
9156   // ---------------------------------------------------------------------------
9157   // Transform initial VPlan: Apply previously taken decisions, in order, to
9158   // bring the VPlan to its final state.
9159   // ---------------------------------------------------------------------------
9160 
9161   // Apply Sink-After legal constraints.
9162   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
9163     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
9164     if (Region && Region->isReplicator()) {
9165       assert(Region->getNumSuccessors() == 1 &&
9166              Region->getNumPredecessors() == 1 && "Expected SESE region!");
9167       assert(R->getParent()->size() == 1 &&
9168              "A recipe in an original replicator region must be the only "
9169              "recipe in its block");
9170       return Region;
9171     }
9172     return nullptr;
9173   };
9174   for (auto &Entry : SinkAfter) {
9175     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
9176     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
9177 
9178     auto *TargetRegion = GetReplicateRegion(Target);
9179     auto *SinkRegion = GetReplicateRegion(Sink);
9180     if (!SinkRegion) {
9181       // If the sink source is not a replicate region, sink the recipe directly.
9182       if (TargetRegion) {
9183         // The target is in a replication region, make sure to move Sink to
9184         // the block after it, not into the replication region itself.
9185         VPBasicBlock *NextBlock =
9186             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
9187         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
9188       } else
9189         Sink->moveAfter(Target);
9190       continue;
9191     }
9192 
9193     // The sink source is in a replicate region. Unhook the region from the CFG.
9194     auto *SinkPred = SinkRegion->getSinglePredecessor();
9195     auto *SinkSucc = SinkRegion->getSingleSuccessor();
9196     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
9197     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
9198     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
9199 
9200     if (TargetRegion) {
9201       // The target recipe is also in a replicate region, move the sink region
9202       // after the target region.
9203       auto *TargetSucc = TargetRegion->getSingleSuccessor();
9204       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
9205       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
9206       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
9207     } else {
      // The sink source is in a replicate region; we need to move the whole
9209       // replicate region, which should only contain a single recipe in the
9210       // main block.
9211       auto *SplitBlock =
9212           Target->getParent()->splitAt(std::next(Target->getIterator()));
9213 
9214       auto *SplitPred = SplitBlock->getSinglePredecessor();
9215 
9216       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
9217       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
9218       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
9219     }
9220   }
9221 
9222   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
9223   VPlanTransforms::removeRedundantInductionCasts(*Plan);
9224 
9225   // Now that sink-after is done, move induction recipes for optimized truncates
9226   // to the phi section of the header block.
9227   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
9228     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
9229 
9230   // Adjust the recipes for any inloop reductions.
9231   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExit()), Plan,
9232                              RecipeBuilder, Range.Start);
9233 
9234   // Introduce a recipe to combine the incoming and previous values of a
9235   // first-order recurrence.
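  // Illustratively, for a recurrence phi %p with backedge value %x, the
  // FirstOrderRecurrenceSplice keeps the last lane of the previous iteration's
  // vector and the first VF-1 lanes of the current one, roughly a
  //   shufflevector %p.prev, %x, <VF-1, VF, ..., 2*VF-2>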
9236   for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9237     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
9238     if (!RecurPhi)
9239       continue;
9240 
9241     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
9242     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
9243     auto *Region = GetReplicateRegion(PrevRecipe);
9244     if (Region)
9245       InsertBlock = cast<VPBasicBlock>(Region->getSingleSuccessor());
9246     if (Region || PrevRecipe->isPhi())
9247       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
9248     else
9249       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
9250 
9251     auto *RecurSplice = cast<VPInstruction>(
9252         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
9253                              {RecurPhi, RecurPhi->getBackedgeValue()}));
9254 
9255     RecurPhi->replaceAllUsesWith(RecurSplice);
9256     // Set the first operand of RecurSplice to RecurPhi again, after replacing
9257     // all users.
9258     RecurSplice->setOperand(0, RecurPhi);
9259   }
9260 
9261   // Interleave memory: for each Interleave Group we marked earlier as relevant
9262   // for this VPlan, replace the Recipes widening its memory instructions with a
9263   // single VPInterleaveRecipe at its insertion point.
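  // Illustratively, an interleave group for the accesses A[2*i] and A[2*i+1]
  // (factor 2) replaces the two widened load recipes with a single
  // VPInterleaveRecipe that later emits one wide load plus shuffles to
  // de-interleave the even and odd elements.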
9264   for (auto IG : InterleaveGroups) {
9265     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
9266         RecipeBuilder.getRecipe(IG->getInsertPos()));
9267     SmallVector<VPValue *, 4> StoredValues;
9268     for (unsigned i = 0; i < IG->getFactor(); ++i)
9269       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
9270         auto *StoreR =
9271             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
9272         StoredValues.push_back(StoreR->getStoredValue());
9273       }
9274 
9275     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
9276                                         Recipe->getMask());
9277     VPIG->insertBefore(Recipe);
9278     unsigned J = 0;
9279     for (unsigned i = 0; i < IG->getFactor(); ++i)
9280       if (Instruction *Member = IG->getMember(i)) {
9281         if (!Member->getType()->isVoidTy()) {
9282           VPValue *OriginalV = Plan->getVPValue(Member);
9283           Plan->removeVPValueFor(Member);
9284           Plan->addVPValue(Member, VPIG->getVPValue(J));
9285           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9286           J++;
9287         }
9288         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9289       }
9290   }
9291 
  // From this point onwards, VPlan-to-VPlan transformations may change the plan
  // in ways that make accessing values through the original IR values
  // incorrect.
9294   Plan->disableValue2VPValue();
9295 
9296   VPlanTransforms::sinkScalarOperands(*Plan);
9297   VPlanTransforms::mergeReplicateRegions(*Plan);
9298 
9299   std::string PlanName;
9300   raw_string_ostream RSO(PlanName);
9301   ElementCount VF = Range.Start;
9302   Plan->addVF(VF);
9303   RSO << "Initial VPlan for VF={" << VF;
9304   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9305     Plan->addVF(VF);
9306     RSO << "," << VF;
9307   }
9308   RSO << "},UF>=1";
9309   RSO.flush();
9310   Plan->setName(PlanName);
9311 
9312   // Fold Exit block into its predecessor if possible.
9313   // TODO: Fold block earlier once all VPlan transforms properly maintain a
9314   // VPBasicBlock as exit.
9315   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExit());
9316 
9317   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9318   return Plan;
9319 }
9320 
9321 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
  // transformations before we can even evaluate whether vectorization is
  // profitable.
9324   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9325   // the vectorization pipeline.
9326   assert(!OrigLoop->isInnermost());
9327   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9328 
9329   // Create new empty VPlan
9330   auto Plan = std::make_unique<VPlan>();
9331 
9332   // Build hierarchical CFG
9333   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9334   HCFGBuilder.buildHierarchicalCFG();
9335 
9336   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9337        VF *= 2)
9338     Plan->addVF(VF);
9339 
9340   if (EnableVPlanPredication) {
9341     VPlanPredicator VPP(*Plan);
9342     VPP.predicate();
9343 
9344     // Avoid running transformation to recipes until masked code generation in
9345     // VPlan-native path is in place.
9346     return Plan;
9347   }
9348 
9349   SmallPtrSet<Instruction *, 1> DeadInstructions;
9350   VPlanTransforms::VPInstructionsToVPRecipes(
9351       OrigLoop, Plan,
9352       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9353       DeadInstructions, *PSE.getSE());
9354 
9355   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9356                         true, true);
9357   return Plan;
9358 }
9359 
9360 // Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be converted
9362 // to reductions, with one operand being vector and the other being the scalar
9363 // reduction chain. For other reductions, a select is introduced between the phi
9364 // and live-out recipes when folding the tail.
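// Illustratively, an in-loop add reduction
//   %sum.next = add i32 %sum.phi, %val
// is replaced by a VPReductionRecipe(add, %sum.phi, %val[, %mask]) that
// reduces the vector operand %val and adds the result to the scalar chain
// %sum.phi.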
9365 void LoopVectorizationPlanner::adjustRecipesForReductions(
9366     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9367     ElementCount MinVF) {
9368   for (auto &Reduction : CM.getInLoopReductionChains()) {
9369     PHINode *Phi = Reduction.first;
9370     const RecurrenceDescriptor &RdxDesc =
9371         Legal->getReductionVars().find(Phi)->second;
9372     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9373 
9374     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9375       continue;
9376 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
    // For minmax the chain will be the select instructions.
9381     Instruction *Chain = Phi;
9382     for (Instruction *R : ReductionOperations) {
9383       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9384       RecurKind Kind = RdxDesc.getRecurrenceKind();
9385 
9386       VPValue *ChainOp = Plan->getVPValue(Chain);
9387       unsigned FirstOpId;
9388       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9389              "Only min/max recurrences allowed for inloop reductions");
9390       // Recognize a call to the llvm.fmuladd intrinsic.
9391       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9392       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9393              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9394       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9395         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9396                "Expected to replace a VPWidenSelectSC");
9397         FirstOpId = 1;
9398       } else {
9399         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9400                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9401                "Expected to replace a VPWidenSC");
9402         FirstOpId = 0;
9403       }
9404       unsigned VecOpId =
9405           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9406       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9407 
9408       auto *CondOp = CM.foldTailByMasking()
9409                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9410                          : nullptr;
9411 
9412       if (IsFMulAdd) {
9413         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9414         // need to create an fmul recipe to use as the vector operand for the
9415         // fadd reduction.
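        // Illustratively, %r = call float @llvm.fmuladd.f32(%a, %b, %sum)
        // becomes an FMul VPInstruction (%a * %b) feeding the fadd reduction,
        // with %sum as the scalar chain operand.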
9416         VPInstruction *FMulRecipe = new VPInstruction(
9417             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9418         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9419         WidenRecipe->getParent()->insert(FMulRecipe,
9420                                          WidenRecipe->getIterator());
9421         VecOp = FMulRecipe;
9422       }
9423       VPReductionRecipe *RedRecipe =
9424           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9425       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9426       Plan->removeVPValueFor(R);
9427       Plan->addVPValue(R, RedRecipe);
9428       WidenRecipe->getParent()->insert(RedRecipe, WidenRecipe->getIterator());
9430       WidenRecipe->eraseFromParent();
9431 
9432       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9433         VPRecipeBase *CompareRecipe =
9434             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9435         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9436                "Expected to replace a VPWidenSC");
9437         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9438                "Expected no remaining users");
9439         CompareRecipe->eraseFromParent();
9440       }
9441       Chain = R;
9442     }
9443   }
9444 
9445   // If tail is folded by masking, introduce selects between the phi
9446   // and the live-out instruction of each reduction, at the beginning of the
9447   // dedicated latch block.
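  // Illustratively, for a reduction phi %rdx.phi with live-out %rdx.next, the
  // latch gains
  //   %sel = select <VF x i1> %header.mask, %rdx.next, %rdx.phi
  // so that masked-off lanes keep their previous value.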
9448   if (CM.foldTailByMasking()) {
9449     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9450     for (VPRecipeBase &R : Plan->getEntry()->getEntryBasicBlock()->phis()) {
9451       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9452       if (!PhiR || PhiR->isInLoop())
9453         continue;
9454       VPValue *Cond =
9455           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9456       VPValue *Red = PhiR->getBackedgeValue();
9457       assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
9458              "reduction recipe must be defined before latch");
9459       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9460     }
9461   }
9462 }
9463 
9464 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9465 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9466                                VPSlotTracker &SlotTracker) const {
9467   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9468   IG->getInsertPos()->printAsOperand(O, false);
9469   O << ", ";
9470   getAddr()->printAsOperand(O, SlotTracker);
9471   VPValue *Mask = getMask();
9472   if (Mask) {
9473     O << ", ";
9474     Mask->printAsOperand(O, SlotTracker);
9475   }
9476 
9477   unsigned OpIdx = 0;
9478   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9479     if (!IG->getMember(i))
9480       continue;
9481     if (getNumStoreOperands() > 0) {
9482       O << "\n" << Indent << "  store ";
9483       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9484       O << " to index " << i;
9485     } else {
9486       O << "\n" << Indent << "  ";
9487       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9488       O << " = load from index " << i;
9489     }
9490     ++OpIdx;
9491   }
9492 }
9493 #endif
9494 
9495 void VPWidenCallRecipe::execute(VPTransformState &State) {
9496   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9497                                   *this, State);
9498 }
9499 
9500 void VPWidenSelectRecipe::execute(VPTransformState &State) {
9501   auto &I = *cast<SelectInst>(getUnderlyingInstr());
9502   State.ILV->setDebugLocFromInst(&I);
9503 
  // The condition can be loop invariant but still defined inside the
9505   // loop. This means that we can't just use the original 'cond' value.
9506   // We have to take the 'vectorized' value and pick the first lane.
9507   // Instcombine will make this a no-op.
9508   auto *InvarCond =
9509       InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
9510 
9511   for (unsigned Part = 0; Part < State.UF; ++Part) {
9512     Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
9513     Value *Op0 = State.get(getOperand(1), Part);
9514     Value *Op1 = State.get(getOperand(2), Part);
9515     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
9516     State.set(this, Sel, Part);
9517     State.ILV->addMetadata(Sel, &I);
9518   }
9519 }
9520 
9521 void VPWidenRecipe::execute(VPTransformState &State) {
9522   auto &I = *cast<Instruction>(getUnderlyingValue());
9523   auto &Builder = State.Builder;
9524   switch (I.getOpcode()) {
9525   case Instruction::Call:
9526   case Instruction::Br:
9527   case Instruction::PHI:
9528   case Instruction::GetElementPtr:
9529   case Instruction::Select:
9530     llvm_unreachable("This instruction is handled by a different recipe.");
9531   case Instruction::UDiv:
9532   case Instruction::SDiv:
9533   case Instruction::SRem:
9534   case Instruction::URem:
9535   case Instruction::Add:
9536   case Instruction::FAdd:
9537   case Instruction::Sub:
9538   case Instruction::FSub:
9539   case Instruction::FNeg:
9540   case Instruction::Mul:
9541   case Instruction::FMul:
9542   case Instruction::FDiv:
9543   case Instruction::FRem:
9544   case Instruction::Shl:
9545   case Instruction::LShr:
9546   case Instruction::AShr:
9547   case Instruction::And:
9548   case Instruction::Or:
9549   case Instruction::Xor: {
9550     // Just widen unops and binops.
9551     State.ILV->setDebugLocFromInst(&I);
9552 
9553     for (unsigned Part = 0; Part < State.UF; ++Part) {
9554       SmallVector<Value *, 2> Ops;
9555       for (VPValue *VPOp : operands())
9556         Ops.push_back(State.get(VPOp, Part));
9557 
9558       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
9559 
9560       if (auto *VecOp = dyn_cast<Instruction>(V)) {
9561         VecOp->copyIRFlags(&I);
9562 
9563         // If the instruction is vectorized and was in a basic block that needed
9564         // predication, we can't propagate poison-generating flags (nuw/nsw,
9565         // exact, etc.). The control flow has been linearized and the
        // instruction is no longer guarded by the predicate, which could cause
        // the flag properties to no longer hold.
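        //
        // For example (illustrative): an 'add nuw nsw' guarded by 'if (i < n)'
        // in the scalar loop may execute for all lanes after vectorization, so
        // nuw/nsw must be dropped to avoid introducing poison on lanes where
        // the guard was false.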
9568         if (State.MayGeneratePoisonRecipes.contains(this))
9569           VecOp->dropPoisonGeneratingFlags();
9570       }
9571 
9572       // Use this vector value for all users of the original instruction.
9573       State.set(this, V, Part);
9574       State.ILV->addMetadata(V, &I);
9575     }
9576 
9577     break;
9578   }
9579   case Instruction::ICmp:
9580   case Instruction::FCmp: {
9581     // Widen compares. Generate vector compares.
9582     bool FCmp = (I.getOpcode() == Instruction::FCmp);
9583     auto *Cmp = cast<CmpInst>(&I);
9584     State.ILV->setDebugLocFromInst(Cmp);
9585     for (unsigned Part = 0; Part < State.UF; ++Part) {
9586       Value *A = State.get(getOperand(0), Part);
9587       Value *B = State.get(getOperand(1), Part);
9588       Value *C = nullptr;
9589       if (FCmp) {
9590         // Propagate fast math flags.
9591         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9592         Builder.setFastMathFlags(Cmp->getFastMathFlags());
9593         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
9594       } else {
9595         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
9596       }
9597       State.set(this, C, Part);
9598       State.ILV->addMetadata(C, &I);
9599     }
9600 
9601     break;
9602   }
9603 
9604   case Instruction::ZExt:
9605   case Instruction::SExt:
9606   case Instruction::FPToUI:
9607   case Instruction::FPToSI:
9608   case Instruction::FPExt:
9609   case Instruction::PtrToInt:
9610   case Instruction::IntToPtr:
9611   case Instruction::SIToFP:
9612   case Instruction::UIToFP:
9613   case Instruction::Trunc:
9614   case Instruction::FPTrunc:
9615   case Instruction::BitCast: {
9616     auto *CI = cast<CastInst>(&I);
9617     State.ILV->setDebugLocFromInst(CI);
9618 
9619     // Vectorize casts.
9620     Type *DestTy = (State.VF.isScalar())
9621                        ? CI->getType()
9622                        : VectorType::get(CI->getType(), State.VF);
9623 
9624     for (unsigned Part = 0; Part < State.UF; ++Part) {
9625       Value *A = State.get(getOperand(0), Part);
9626       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
9627       State.set(this, Cast, Part);
9628       State.ILV->addMetadata(Cast, &I);
9629     }
9630     break;
9631   }
9632   default:
9633     // This instruction is not vectorized by simple widening.
9634     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
9635     llvm_unreachable("Unhandled instruction!");
9636   } // end of switch.
9637 }
9638 
9639 void VPWidenGEPRecipe::execute(VPTransformState &State) {
9640   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
9641   // Construct a vector GEP by widening the operands of the scalar GEP as
9642   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
9643   // results in a vector of pointers when at least one operand of the GEP
9644   // is vector-typed. Thus, to keep the representation compact, we only use
9645   // vector-typed operands for loop-varying values.
9646 
9647   if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
9648     // If we are vectorizing, but the GEP has only loop-invariant operands,
9649     // the GEP we build (by only using vector-typed operands for
9650     // loop-varying values) would be a scalar pointer. Thus, to ensure we
9651     // produce a vector of pointers, we need to either arbitrarily pick an
9652     // operand to broadcast, or broadcast a clone of the original GEP.
9653     // Here, we broadcast a clone of the original.
9654     //
9655     // TODO: If at some point we decide to scalarize instructions having
9656     //       loop-invariant operands, this special case will no longer be
9657     //       required. We would add the scalarization decision to
9658     //       collectLoopScalars() and teach getVectorValue() to broadcast
9659     //       the lane-zero scalar value.
9660     auto *Clone = State.Builder.Insert(GEP->clone());
9661     for (unsigned Part = 0; Part < State.UF; ++Part) {
9662       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
9663       State.set(this, EntryPart, Part);
9664       State.ILV->addMetadata(EntryPart, GEP);
9665     }
9666   } else {
9667     // If the GEP has at least one loop-varying operand, we are sure to
9668     // produce a vector of pointers. But if we are only unrolling, we want
9669     // to produce a scalar GEP for each unroll part. Thus, the GEP we
9670     // produce with the code below will be scalar (if VF == 1) or vector
9671     // (otherwise). Note that for the unroll-only case, we still maintain
9672     // values in the vector mapping with initVector, as we do for other
9673     // instructions.
9674     for (unsigned Part = 0; Part < State.UF; ++Part) {
9675       // The pointer operand of the new GEP. If it's loop-invariant, we
9676       // won't broadcast it.
9677       auto *Ptr = IsPtrLoopInvariant
9678                       ? State.get(getOperand(0), VPIteration(0, 0))
9679                       : State.get(getOperand(0), Part);
9680 
9681       // Collect all the indices for the new GEP. If any index is
9682       // loop-invariant, we won't broadcast it.
9683       SmallVector<Value *, 4> Indices;
9684       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
9685         VPValue *Operand = getOperand(I);
9686         if (IsIndexLoopInvariant[I - 1])
9687           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
9688         else
9689           Indices.push_back(State.get(Operand, Part));
9690       }
9691 
9692       // If the GEP instruction is vectorized and was in a basic block that
9693       // needed predication, we can't propagate the poison-generating 'inbounds'
9694       // flag. The control flow has been linearized and the GEP is no longer
9695       // guarded by the predicate, so the 'inbounds' property may no longer
9696       // hold.
9697       bool IsInBounds =
9698           GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
9699 
9700       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
9701       // but it should be a vector, otherwise.
9702       auto *NewGEP = IsInBounds
9703                          ? State.Builder.CreateInBoundsGEP(
9704                                GEP->getSourceElementType(), Ptr, Indices)
9705                          : State.Builder.CreateGEP(GEP->getSourceElementType(),
9706                                                    Ptr, Indices);
9707       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
9708              "NewGEP is not a pointer vector");
9709       State.set(this, NewGEP, Part);
9710       State.ILV->addMetadata(NewGEP, GEP);
9711     }
9712   }
9713 }
9714 
9715 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9716   assert(!State.Instance && "Int or FP induction being replicated.");
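       // Delegate to the ILV helper, which widens the induction using the
       // canonical IV value of part 0.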
9717   auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
9718   State.ILV->widenIntOrFpInduction(IV, this, State, CanonicalIV);
9719 }
9720 
9721 void VPWidenPHIRecipe::execute(VPTransformState &State) {
9722   State.ILV->widenPHIInstruction(cast<PHINode>(getUnderlyingValue()), this,
9723                                  State);
9724 }
9725 
9726 void VPBlendRecipe::execute(VPTransformState &State) {
9727   State.ILV->setDebugLocFromInst(Phi, &State.Builder);
9728   // We know that all PHIs in non-header blocks are converted into
9729   // selects, so we don't have to worry about the insertion order and we
9730   // can just use the builder.
9731   // At this point we generate the predication tree. There may be
9732   // duplications since this is a simple recursive scan, but future
9733   // optimizations will clean it up.
9734 
9735   unsigned NumIncoming = getNumIncomingValues();
9736 
9737   // Generate a sequence of selects of the form:
9738   // SELECT(Mask3, In3,
9739   //        SELECT(Mask2, In2,
9740   //               SELECT(Mask1, In1,
9741   //                      In0)))
9742   // Note that Mask0 is never used: lanes for which no path reaches this phi,
9743   // and which are essentially undef, are taken from In0.
9744   InnerLoopVectorizer::VectorParts Entry(State.UF);
9745   for (unsigned In = 0; In < NumIncoming; ++In) {
9746     for (unsigned Part = 0; Part < State.UF; ++Part) {
9747       // We might have single edge PHIs (blocks) - use an identity
9748       // 'select' for the first PHI operand.
9749       Value *In0 = State.get(getIncomingValue(In), Part);
9750       if (In == 0)
9751         Entry[Part] = In0; // Initialize with the first incoming value.
9752       else {
9753         // Select between the current value and the previous incoming edge
9754         // based on the incoming mask.
9755         Value *Cond = State.get(getMask(In), Part);
9756         Entry[Part] =
9757             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
9758       }
9759     }
9760   }
9761   for (unsigned Part = 0; Part < State.UF; ++Part)
9762     State.set(this, Entry[Part], Part);
9763 }
9764 
9765 void VPInterleaveRecipe::execute(VPTransformState &State) {
9766   assert(!State.Instance && "Interleave group being replicated.");
9767   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9768                                       getStoredValues(), getMask());
9769 }
9770 
9771 void VPReductionRecipe::execute(VPTransformState &State) {
9772   assert(!State.Instance && "Reduction being replicated.");
9773   Value *PrevInChain = State.get(getChainOp(), 0);
9774   RecurKind Kind = RdxDesc->getRecurrenceKind();
9775   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9776   // Propagate the fast-math flags carried by the underlying instruction.
9777   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9778   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9779   for (unsigned Part = 0; Part < State.UF; ++Part) {
9780     Value *NewVecOp = State.get(getVecOp(), Part);
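         // For conditional reductions, select the reduction identity in the
         // masked-off lanes so that they do not affect the reduced value.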
9781     if (VPValue *Cond = getCondOp()) {
9782       Value *NewCond = State.get(Cond, Part);
9783       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9784       Value *Iden = RdxDesc->getRecurrenceIdentity(
9785           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9786       Value *IdenVec =
9787           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9788       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9789       NewVecOp = Select;
9790     }
9791     Value *NewRed;
9792     Value *NextInChain;
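         // Ordered (strictly in-order FP) reductions fold the vector operand
         // directly into the chain value; otherwise the vector is reduced
         // first and then combined with the chain value below.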
9793     if (IsOrdered) {
9794       if (State.VF.isVector())
9795         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9796                                         PrevInChain);
9797       else
9798         NewRed = State.Builder.CreateBinOp(
9799             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9800             NewVecOp);
9801       PrevInChain = NewRed;
9802     } else {
9803       PrevInChain = State.get(getChainOp(), Part);
9804       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9805     }
9806     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9807       NextInChain =
9808           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9809                          NewRed, PrevInChain);
9810     } else if (IsOrdered)
9811       NextInChain = NewRed;
9812     else
9813       NextInChain = State.Builder.CreateBinOp(
9814           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9815           PrevInChain);
9816     State.set(this, NextInChain, Part);
9817   }
9818 }
9819 
9820 void VPReplicateRecipe::execute(VPTransformState &State) {
9821   if (State.Instance) { // Generate a single instance.
9822     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9823     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9824                                     IsPredicated, State);
9825     // Insert scalar instance packing it into a vector.
9826     if (AlsoPack && State.VF.isVector()) {
9827       // If we're constructing lane 0, initialize to start from poison.
9828       if (State.Instance->Lane.isFirstLane()) {
9829         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9830         Value *Poison = PoisonValue::get(
9831             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9832         State.set(this, Poison, State.Instance->Part);
9833       }
9834       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9835     }
9836     return;
9837   }
9838 
9839   // Generate scalar instances for all VF lanes of all UF parts, unless the
9840   // instruction is uniform, in which case generate only the first lane for each
9841   // of the UF parts.
9842   unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
9843   assert((!State.VF.isScalable() || IsUniform) &&
9844          "Can't scalarize a scalable vector");
9845   for (unsigned Part = 0; Part < State.UF; ++Part)
9846     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9847       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9848                                       VPIteration(Part, Lane), IsPredicated,
9849                                       State);
9850 }
9851 
9852 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
9853   assert(State.Instance && "Branch on Mask works only on single instance.");
9854 
9855   unsigned Part = State.Instance->Part;
9856   unsigned Lane = State.Instance->Lane.getKnownLane();
9857 
9858   Value *ConditionBit = nullptr;
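       // Compute the scalar condition bit for this lane: extract it from the
       // block mask if one exists; a missing mask means the block is always
       // executed.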
9859   VPValue *BlockInMask = getMask();
9860   if (BlockInMask) {
9861     ConditionBit = State.get(BlockInMask, Part);
9862     if (ConditionBit->getType()->isVectorTy())
9863       ConditionBit = State.Builder.CreateExtractElement(
9864           ConditionBit, State.Builder.getInt32(Lane));
9865   } else // Block in mask is all-one.
9866     ConditionBit = State.Builder.getTrue();
9867 
9868   // Replace the temporary unreachable terminator with a new conditional branch,
9869   // whose two destinations will be set later when they are created.
9870   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
9871   assert(isa<UnreachableInst>(CurrentTerminator) &&
9872          "Expected to replace unreachable terminator with conditional branch.");
9873   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
9874   CondBr->setSuccessor(0, nullptr);
9875   ReplaceInstWithInst(CurrentTerminator, CondBr);
9876 }
9877 
9878 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
9879   assert(State.Instance && "Predicated instruction PHI works per instance.");
9880   Instruction *ScalarPredInst =
9881       cast<Instruction>(State.get(getOperand(0), *State.Instance));
9882   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
9883   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
9884   assert(PredicatingBB && "Predicated block has no single predecessor.");
9885   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
9886          "operand must be VPReplicateRecipe");
9887 
9888   // By current pack/unpack logic we need to generate only a single phi node: if
9889   // a vector value for the predicated instruction exists at this point it means
9890   // the instruction has vector users only, and a phi for the vector value is
9891   // needed. In this case the recipe of the predicated instruction is marked to
9892   // also do that packing, thereby "hoisting" the insert-element sequence.
9893   // Otherwise, a phi node for the scalar value is needed.
9894   unsigned Part = State.Instance->Part;
9895   if (State.hasVectorValue(getOperand(0), Part)) {
9896     Value *VectorValue = State.get(getOperand(0), Part);
9897     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
9898     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
9899     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
9900     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
9901     if (State.hasVectorValue(this, Part))
9902       State.reset(this, VPhi, Part);
9903     else
9904       State.set(this, VPhi, Part);
9905     // NOTE: Currently we need to update the value of the operand, so the next
9906     // predicated iteration inserts its generated value in the correct vector.
9907     State.reset(getOperand(0), VPhi, Part);
9908   } else {
9909     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
9910     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
9911     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
9912                      PredicatingBB);
9913     Phi->addIncoming(ScalarPredInst, PredicatedBB);
9914     if (State.hasScalarValue(this, *State.Instance))
9915       State.reset(this, Phi, *State.Instance);
9916     else
9917       State.set(this, Phi, *State.Instance);
9918     // NOTE: Currently we need to update the value of the operand, so the next
9919     // predicated iteration inserts its generated value in the correct vector.
9920     State.reset(getOperand(0), Phi, *State.Instance);
9921   }
9922 }
9923 
9924 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9925   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9926 
9927   // Attempt to issue a wide load.
9928   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9929   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9930 
9931   assert((LI || SI) && "Invalid Load/Store instruction");
9932   assert((!SI || StoredValue) && "No stored value provided for widened store");
9933   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9934 
9935   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9936 
9937   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9938   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9939   bool CreateGatherScatter = !Consecutive;
9940 
9941   auto &Builder = State.Builder;
9942   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9943   bool isMaskRequired = getMask();
9944   if (isMaskRequired)
9945     for (unsigned Part = 0; Part < State.UF; ++Part)
9946       BlockInMaskParts[Part] = State.get(getMask(), Part);
9947 
9948   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9949     // Calculate the pointer for the specific unroll-part.
9950     GetElementPtrInst *PartPtr = nullptr;
9951 
9952     bool InBounds = false;
9953     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9954       InBounds = gep->isInBounds();
9955     if (Reverse) {
9956       // If the address is consecutive but reversed, then the
9957       // wide store needs to start at the last vector element.
9958       // RunTimeVF = VScale * VF.getKnownMinValue()
9959       // For fixed-width, VScale is 1, so RunTimeVF = VF.getKnownMinValue()
9960       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9961       // NumElt = -Part * RunTimeVF
9962       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9963       // LastLane = 1 - RunTimeVF
9964       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
9965       PartPtr =
9966           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9967       PartPtr->setIsInBounds(InBounds);
9968       PartPtr = cast<GetElementPtrInst>(
9969           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9970       PartPtr->setIsInBounds(InBounds);
9971       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9972         BlockInMaskParts[Part] =
9973             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9974     } else {
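           // Consecutive access: advance the pointer by Part * VF elements,
           // computed at runtime for scalable vectors.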
9975       Value *Increment =
9976           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9977       PartPtr = cast<GetElementPtrInst>(
9978           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9979       PartPtr->setIsInBounds(InBounds);
9980     }
9981 
9982     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9983     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9984   };
9985 
9986   // Handle Stores:
9987   if (SI) {
9988     State.ILV->setDebugLocFromInst(SI);
9989 
9990     for (unsigned Part = 0; Part < State.UF; ++Part) {
9991       Instruction *NewSI = nullptr;
9992       Value *StoredVal = State.get(StoredValue, Part);
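           // Non-consecutive accesses become masked scatters of the per-part
           // address vector; consecutive accesses use a (possibly masked) wide
           // store below.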
9993       if (CreateGatherScatter) {
9994         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9995         Value *VectorGep = State.get(getAddr(), Part);
9996         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9997                                             MaskPart);
9998       } else {
9999         if (Reverse) {
10000           // If we store to reverse consecutive memory locations, then we need
10001           // to reverse the order of elements in the stored value.
10002           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
10003           // We don't want to update the value in the map as it might be used in
10004           // another expression. So don't call resetVectorValue(StoredVal).
10005         }
10006         auto *VecPtr =
10007             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10008         if (isMaskRequired)
10009           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
10010                                             BlockInMaskParts[Part]);
10011         else
10012           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
10013       }
10014       State.ILV->addMetadata(NewSI, SI);
10015     }
10016     return;
10017   }
10018 
10019   // Handle loads.
10020   assert(LI && "Must have a load instruction");
10021   State.ILV->setDebugLocFromInst(LI);
10022   for (unsigned Part = 0; Part < State.UF; ++Part) {
10023     Value *NewLI;
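          // Non-consecutive accesses become masked gathers; consecutive
          // accesses use a (possibly masked) wide load, reversed afterwards if
          // required.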
10024     if (CreateGatherScatter) {
10025       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
10026       Value *VectorGep = State.get(getAddr(), Part);
10027       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
10028                                          nullptr, "wide.masked.gather");
10029       State.ILV->addMetadata(NewLI, LI);
10030     } else {
10031       auto *VecPtr =
10032           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
10033       if (isMaskRequired)
10034         NewLI = Builder.CreateMaskedLoad(
10035             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
10036             PoisonValue::get(DataTy), "wide.masked.load");
10037       else
10038         NewLI =
10039             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
10040 
10041       // Add metadata to the load, but setVectorValue to the reverse shuffle.
10042       State.ILV->addMetadata(NewLI, LI);
10043       if (Reverse)
10044         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
10045     }
10046 
10047     State.set(this, NewLI, Part);
10048   }
10049 }
10050 
10051 // Determine how to lower the scalar epilogue, which depends on 1) optimising
10052 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
10053 // predication, and 4) a TTI hook that analyses whether the loop is suitable
10054 // for predication.
10055 static ScalarEpilogueLowering getScalarEpilogueLowering(
10056     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
10057     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
10058     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
10059     LoopVectorizationLegality &LVL) {
10060   // 1) OptSize takes precedence over all other options, i.e. if this is set,
10061   // don't look at hints or options, and don't request a scalar epilogue.
10062   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
10063   // LoopAccessInfo (due to code dependency and not being able to reliably get
10064   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
10065   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
10066   // versioning when the vectorization is forced, unlike hasOptSize. So revert
10067   // back to the old way and vectorize with versioning when forced. See D81345.)
10068   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
10069                                                       PGSOQueryType::IRPass) &&
10070                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
10071     return CM_ScalarEpilogueNotAllowedOptSize;
10072 
10073   // 2) If set, obey the directives
10074   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
10075     switch (PreferPredicateOverEpilogue) {
10076     case PreferPredicateTy::ScalarEpilogue:
10077       return CM_ScalarEpilogueAllowed;
10078     case PreferPredicateTy::PredicateElseScalarEpilogue:
10079       return CM_ScalarEpilogueNotNeededUsePredicate;
10080     case PreferPredicateTy::PredicateOrDontVectorize:
10081       return CM_ScalarEpilogueNotAllowedUsePredicate;
10082     };
10083   }
10084 
10085   // 3) If set, obey the hints
10086   switch (Hints.getPredicate()) {
10087   case LoopVectorizeHints::FK_Enabled:
10088     return CM_ScalarEpilogueNotNeededUsePredicate;
10089   case LoopVectorizeHints::FK_Disabled:
10090     return CM_ScalarEpilogueAllowed;
10091   };
10092 
10093   // 4) if the TTI hook indicates this is profitable, request predication.
10094   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
10095                                        LVL.getLAI()))
10096     return CM_ScalarEpilogueNotNeededUsePredicate;
10097 
10098   return CM_ScalarEpilogueAllowed;
10099 }
10100 
10101 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
10102   // If Values have been set for this Def, return the one relevant for \p Part.
10103   if (hasVectorValue(Def, Part))
10104     return Data.PerPartOutput[Def][Part];
10105 
10106   if (!hasScalarValue(Def, {Part, 0})) {
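          // Neither a vector nor a scalar value exists for this part, so Def
          // must wrap a live-in IR value; broadcast it once and cache the
          // result.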
10107     Value *IRV = Def->getLiveInIRValue();
10108     Value *B = ILV->getBroadcastInstrs(IRV);
10109     set(Def, B, Part);
10110     return B;
10111   }
10112 
10113   Value *ScalarValue = get(Def, {Part, 0});
10114   // If we aren't vectorizing, we can just copy the scalar map values over
10115   // to the vector map.
10116   if (VF.isScalar()) {
10117     set(Def, ScalarValue, Part);
10118     return ScalarValue;
10119   }
10120 
10121   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
10122   bool IsUniform = RepR && RepR->isUniform();
10123 
10124   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
10125   // Check if there is a scalar value for the selected lane.
10126   if (!hasScalarValue(Def, {Part, LastLane})) {
10127     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
10128     assert(isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) &&
10129            "unexpected recipe found to be invariant");
10130     IsUniform = true;
10131     LastLane = 0;
10132   }
10133 
10134   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
10135   // Set the insert point after the last scalarized instruction or after the
10136   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
10137   // will directly follow the scalar definitions.
10138   auto OldIP = Builder.saveIP();
10139   auto NewIP =
10140       isa<PHINode>(LastInst)
10141           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
10142           : std::next(BasicBlock::iterator(LastInst));
10143   Builder.SetInsertPoint(&*NewIP);
10144 
10145   // However, if we are vectorizing, we need to construct the vector values.
10146   // If the value is known to be uniform after vectorization, we can just
10147   // broadcast the scalar value corresponding to lane zero for each unroll
10148   // iteration. Otherwise, we construct the vector values using
10149   // insertelement instructions. Since the resulting vectors are stored in
10150   // State, we will only generate the insertelements once.
10151   Value *VectorValue = nullptr;
10152   if (IsUniform) {
10153     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
10154     set(Def, VectorValue, Part);
10155   } else {
10156     // Initialize packing with insertelements to start from poison.
10157     assert(!VF.isScalable() && "VF is assumed to be non scalable.");
10158     Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
10159     set(Def, Poison, Part);
10160     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
10161       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
10162     VectorValue = get(Def, Part);
10163   }
10164   Builder.restoreIP(OldIP);
10165   return VectorValue;
10166 }
10167 
10168 // Process the loop in the VPlan-native vectorization path. This path builds
10169 // VPlan upfront in the vectorization pipeline, which allows us to apply
10170 // VPlan-to-VPlan transformations from the very beginning without modifying the
10171 // input LLVM IR.
10172 static bool processLoopInVPlanNativePath(
10173     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
10174     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
10175     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
10176     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
10177     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
10178     LoopVectorizationRequirements &Requirements) {
10179 
10180   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
10181     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
10182     return false;
10183   }
10184   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
10185   Function *F = L->getHeader()->getParent();
10186   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
10187 
10188   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10189       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
10190 
10191   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
10192                                 &Hints, IAI);
10193   // Use the planner for outer loop vectorization.
10194   // TODO: CM is not used at this point inside the planner. Turn CM into an
10195   // optional argument if we don't need it in the future.
10196   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints,
10197                                Requirements, ORE);
10198 
10199   // Get user vectorization factor.
10200   ElementCount UserVF = Hints.getWidth();
10201 
10202   CM.collectElementTypesForWidening();
10203 
10204   // Plan how to best vectorize, return the best VF and its cost.
10205   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
10206 
10207   // If we are stress testing VPlan builds, do not attempt to generate vector
10208   // code. Masked vector code generation support will follow soon.
10209   // Also, do not attempt to vectorize if no vector code will be produced.
10210   if (VPlanBuildStressTest || EnableVPlanPredication ||
10211       VectorizationFactor::Disabled() == VF)
10212     return false;
10213 
10214   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10215 
10216   {
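          // Generate vector code for the selected VF. The VPlan-native path
          // currently always uses an interleave factor of 1.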
10217     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10218                              F->getParent()->getDataLayout());
10219     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
10220                            &CM, BFI, PSI, Checks);
10221     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
10222                       << L->getHeader()->getParent()->getName() << "\"\n");
10223     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
10224   }
10225 
10226   // Mark the loop as already vectorized to avoid vectorizing again.
10227   Hints.setAlreadyVectorized();
10228   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10229   return true;
10230 }
10231 
10232 // Emit a remark if there are stores to floats that required a floating point
10233 // extension. If the vectorized loop performs its computation at a wider
10234 // floating point precision than the stored type, there will be a performance
10235 // penalty from the conversion overhead and the change in the vector width.
10236 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10237   SmallVector<Instruction *, 4> Worklist;
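        // Seed the worklist with all stores of 'float' typed values in the
        // loop.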
10238   for (BasicBlock *BB : L->getBlocks()) {
10239     for (Instruction &Inst : *BB) {
10240       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
10241         if (S->getValueOperand()->getType()->isFloatTy())
10242           Worklist.push_back(S);
10243       }
10244     }
10245   }
10246 
10247   // Traverse upwards from the floating point stores, searching for floating
10248   // point conversions.
10249   SmallPtrSet<const Instruction *, 4> Visited;
10250   SmallPtrSet<const Instruction *, 4> EmittedRemark;
10251   while (!Worklist.empty()) {
10252     auto *I = Worklist.pop_back_val();
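          // Only follow operands defined inside the loop, and visit each
          // instruction at most once.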
10253     if (!L->contains(I))
10254       continue;
10255     if (!Visited.insert(I).second)
10256       continue;
10257 
10258     // Emit a remark if the floating point store required a floating
10259     // point conversion.
10260     // TODO: More work could be done to identify the root cause such as a
10261     // constant or a function return type and point the user to it.
10262     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
10263       ORE->emit([&]() {
10264         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
10265                                           I->getDebugLoc(), L->getHeader())
10266                << "floating point conversion changes vector width. "
10267                << "Mixed floating point precision requires an up/down "
10268                << "cast that will negatively impact performance.";
10269       });
10270 
10271     for (Use &Op : I->operands())
10272       if (auto *OpI = dyn_cast<Instruction>(Op))
10273         Worklist.push_back(OpI);
10274   }
10275 }
10276 
10277 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10278     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10279                                !EnableLoopInterleaving),
10280       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10281                               !EnableLoopVectorization) {}
10282 
10283 bool LoopVectorizePass::processLoop(Loop *L) {
10284   assert((EnableVPlanNativePath || L->isInnermost()) &&
10285          "VPlan-native path is not enabled. Only process inner loops.");
10286 
10287 #ifndef NDEBUG
10288   const std::string DebugLocStr = getDebugLocString(L);
10289 #endif /* NDEBUG */
10290 
10291   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
10292                     << L->getHeader()->getParent()->getName() << "\" from "
10293                     << DebugLocStr << "\n");
10294 
10295   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10296 
10297   LLVM_DEBUG(
10298       dbgs() << "LV: Loop hints:"
10299              << " force="
10300              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10301                      ? "disabled"
10302                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10303                             ? "enabled"
10304                             : "?"))
10305              << " width=" << Hints.getWidth()
10306              << " interleave=" << Hints.getInterleave() << "\n");
10307 
10308   // Function containing loop
10309   Function *F = L->getHeader()->getParent();
10310 
10311   // Looking at the diagnostic output is the only way to determine if a loop
10312   // was vectorized (other than looking at the IR or machine code), so it
10313   // is important to generate an optimization remark for each loop. Most of
10314   // these messages are generated as OptimizationRemarkAnalysis. Remarks
10315   // generated as OptimizationRemark and OptimizationRemarkMissed are less
10316   // verbose; they report vectorized loops and unvectorized loops that may
10317   // benefit from vectorization, respectively.
10318 
10319   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10320     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10321     return false;
10322   }
10323 
10324   PredicatedScalarEvolution PSE(*SE, *L);
10325 
10326   // Check if it is legal to vectorize the loop.
10327   LoopVectorizationRequirements Requirements;
10328   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10329                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10330   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10331     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10332     Hints.emitRemarkWithHints();
10333     return false;
10334   }
10335 
10336   // Check the function attributes and profiles to find out if this function
10337   // should be optimized for size.
10338   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10339       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10340 
10341   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10342   // here. They may require CFG and instruction level transformations before
10343   // even evaluating whether vectorization is profitable. Since we cannot modify
10344   // the incoming IR, we need to build VPlan upfront in the vectorization
10345   // pipeline.
10346   if (!L->isInnermost())
10347     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10348                                         ORE, BFI, PSI, Hints, Requirements);
10349 
10350   assert(L->isInnermost() && "Inner loop expected.");
10351 
10352   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10353   // count by optimizing for size, to minimize overheads.
10354   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10355   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10356     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10357                       << "This loop is worth vectorizing only if no scalar "
10358                       << "iteration overheads are incurred.");
10359     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10360       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10361     else {
10362       LLVM_DEBUG(dbgs() << "\n");
10363       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10364     }
10365   }
10366 
10367   // Check the function attributes to see if implicit floats are allowed.
10368   // FIXME: This check doesn't seem possibly correct -- what if the loop is
10369   // an integer loop and the vector instructions selected are purely integer
10370   // vector instructions?
10371   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10372     reportVectorizationFailure(
10373         "Can't vectorize when the NoImplicitFloat attribute is used",
10374         "loop not vectorized due to NoImplicitFloat attribute",
10375         "NoImplicitFloat", ORE, L);
10376     Hints.emitRemarkWithHints();
10377     return false;
10378   }
10379 
10380   // Check if the target supports potentially unsafe FP vectorization.
10381   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10382   // for the target we're vectorizing for, to make sure none of the
10383   // additional fp-math flags can help.
10384   if (Hints.isPotentiallyUnsafe() &&
10385       TTI->isFPVectorizationPotentiallyUnsafe()) {
10386     reportVectorizationFailure(
10387         "Potentially unsafe FP op prevents vectorization",
10388         "loop not vectorized due to unsafe FP support.",
10389         "UnsafeFP", ORE, L);
10390     Hints.emitRemarkWithHints();
10391     return false;
10392   }
10393 
10394   bool AllowOrderedReductions;
10395   // If the flag is set, use that instead and override the TTI behaviour.
10396   if (ForceOrderedReductions.getNumOccurrences() > 0)
10397     AllowOrderedReductions = ForceOrderedReductions;
10398   else
10399     AllowOrderedReductions = TTI->enableOrderedReductions();
10400   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10401     ORE->emit([&]() {
10402       auto *ExactFPMathInst = Requirements.getExactFPInst();
10403       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10404                                                  ExactFPMathInst->getDebugLoc(),
10405                                                  ExactFPMathInst->getParent())
10406              << "loop not vectorized: cannot prove it is safe to reorder "
10407                 "floating-point operations";
10408     });
10409     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10410                          "reorder floating-point operations\n");
10411     Hints.emitRemarkWithHints();
10412     return false;
10413   }
10414 
10415   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10416   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10417 
10418   // If an override option has been passed in for interleaved accesses, use it.
10419   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10420     UseInterleaved = EnableInterleavedMemAccesses;
10421 
10422   // Analyze interleaved memory accesses.
10423   if (UseInterleaved) {
10424     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10425   }
10426 
10427   // Use the cost model.
10428   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10429                                 F, &Hints, IAI);
10430   CM.collectValuesToIgnore();
10431   CM.collectElementTypesForWidening();
10432 
10433   // Use the planner for vectorization.
10434   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
10435                                Requirements, ORE);
10436 
10437   // Get user vectorization factor and interleave count.
10438   ElementCount UserVF = Hints.getWidth();
10439   unsigned UserIC = Hints.getInterleave();
10440 
10441   // Plan how to best vectorize, return the best VF and its cost.
10442   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10443 
10444   VectorizationFactor VF = VectorizationFactor::Disabled();
10445   unsigned IC = 1;
10446 
10447   if (MaybeVF) {
10448     VF = *MaybeVF;
10449     // Select the interleave count.
10450     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10451   }
10452 
10453   // Identify the diagnostic messages that should be produced.
10454   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10455   bool VectorizeLoop = true, InterleaveLoop = true;
10456   if (VF.Width.isScalar()) {
10457     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10458     VecDiagMsg = std::make_pair(
10459         "VectorizationNotBeneficial",
10460         "the cost-model indicates that vectorization is not beneficial");
10461     VectorizeLoop = false;
10462   }
10463 
10464   if (!MaybeVF && UserIC > 1) {
10465     // Tell the user interleaving was avoided up-front, despite being explicitly
10466     // requested.
10467     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10468                          "interleaving should be avoided up front\n");
10469     IntDiagMsg = std::make_pair(
10470         "InterleavingAvoided",
10471         "Ignoring UserIC, because interleaving was avoided up front");
10472     InterleaveLoop = false;
10473   } else if (IC == 1 && UserIC <= 1) {
10474     // Tell the user interleaving is not beneficial.
10475     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10476     IntDiagMsg = std::make_pair(
10477         "InterleavingNotBeneficial",
10478         "the cost-model indicates that interleaving is not beneficial");
10479     InterleaveLoop = false;
10480     if (UserIC == 1) {
10481       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10482       IntDiagMsg.second +=
10483           " and is explicitly disabled or interleave count is set to 1";
10484     }
10485   } else if (IC > 1 && UserIC == 1) {
10486     // Tell the user interleaving is beneficial, but it is explicitly disabled.
10487     LLVM_DEBUG(
10488         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10489     IntDiagMsg = std::make_pair(
10490         "InterleavingBeneficialButDisabled",
10491         "the cost-model indicates that interleaving is beneficial "
10492         "but is explicitly disabled or interleave count is set to 1");
10493     InterleaveLoop = false;
10494   }
10495 
10496   // Override IC if user provided an interleave count.
10497   IC = UserIC > 0 ? UserIC : IC;
10498 
10499   // Emit diagnostic messages, if any.
10500   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10501   if (!VectorizeLoop && !InterleaveLoop) {
10502     // Do not vectorize or interleave the loop.
10503     ORE->emit([&]() {
10504       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10505                                       L->getStartLoc(), L->getHeader())
10506              << VecDiagMsg.second;
10507     });
10508     ORE->emit([&]() {
10509       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10510                                       L->getStartLoc(), L->getHeader())
10511              << IntDiagMsg.second;
10512     });
10513     return false;
10514   } else if (!VectorizeLoop && InterleaveLoop) {
10515     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10516     ORE->emit([&]() {
10517       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10518                                         L->getStartLoc(), L->getHeader())
10519              << VecDiagMsg.second;
10520     });
10521   } else if (VectorizeLoop && !InterleaveLoop) {
10522     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10523                       << ") in " << DebugLocStr << '\n');
10524     ORE->emit([&]() {
10525       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10526                                         L->getStartLoc(), L->getHeader())
10527              << IntDiagMsg.second;
10528     });
10529   } else if (VectorizeLoop && InterleaveLoop) {
10530     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10531                       << ") in " << DebugLocStr << '\n');
10532     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10533   }
10534 
10535   bool DisableRuntimeUnroll = false;
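        // Remember the original loop metadata so that follow-up metadata can
        // be attached to the remainder loop once vectorization is done.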
10536   MDNode *OrigLoopID = L->getLoopID();
10537   {
10538     // Optimistically generate runtime checks. Drop them if they turn out to not
10539     // be profitable. Limit the scope of Checks, so the cleanup happens
10540     // immediately after vector code generation is done.
10541     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI,
10542                              F->getParent()->getDataLayout());
10543     if (!VF.Width.isScalar() || IC > 1)
10544       Checks.Create(L, *LVL.getLAI(), PSE.getUnionPredicate());
10545 
10546     using namespace ore;
10547     if (!VectorizeLoop) {
10548       assert(IC > 1 && "interleave count should not be 1 or 0");
10549       // If we decided that it is not beneficial to vectorize the loop, then
10550       // interleave it.
10551       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10552                                  &CM, BFI, PSI, Checks);
10553 
10554       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10555       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
10556 
10557       ORE->emit([&]() {
10558         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10559                                   L->getHeader())
10560                << "interleaved loop (interleaved count: "
10561                << NV("InterleaveCount", IC) << ")";
10562       });
10563     } else {
10564       // If we decided that it is *worthwhile* to vectorize the loop, then do it.
10565 
10566       // Consider vectorizing the epilogue too if it's profitable.
10567       VectorizationFactor EpilogueVF =
10568           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10569       if (EpilogueVF.Width.isVector()) {
10570 
10571         // The first pass vectorizes the main loop and creates a scalar epilogue
10572         // to be vectorized by executing the plan (potentially with a different
10573         // factor) again shortly afterwards.
10574         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10575         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10576                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10577 
10578         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10579         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10580                         DT);
10581         ++LoopsVectorized;
10582 
10583         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10584         formLCSSARecursively(*L, *DT, LI, SE);
10585 
10586         // Second pass vectorizes the epilogue and adjusts the control flow
10587         // edges from the first pass.
10588         EPI.MainLoopVF = EPI.EpilogueVF;
10589         EPI.MainLoopUF = EPI.EpilogueUF;
10590         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10591                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10592                                                  Checks);
10593 
10594         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10595 
10596         // Ensure that the start values for any VPReductionPHIRecipes are
10597         // updated before vectorising the epilogue loop.
10598         VPBasicBlock *Header = BestEpiPlan.getEntry()->getEntryBasicBlock();
10599         for (VPRecipeBase &R : Header->phis()) {
10600           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10601             if (auto *Resume = MainILV.getReductionResumeValue(
10602                     ReductionPhi->getRecurrenceDescriptor())) {
10603               VPValue *StartVal = new VPValue(Resume);
10604               BestEpiPlan.addExternalDef(StartVal);
10605               ReductionPhi->setOperand(0, StartVal);
10606             }
10607           }
10608         }
10609 
10610         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10611                         DT);
10612         ++LoopsEpilogueVectorized;
10613 
10614         if (!MainILV.areSafetyChecksAdded())
10615           DisableRuntimeUnroll = true;
10616       } else {
10617         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
10618                                &LVL, &CM, BFI, PSI, Checks);
10619 
10620         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10621         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
10622         ++LoopsVectorized;
10623 
10624         // Add metadata to disable runtime unrolling of the scalar loop when there
10625         // are no runtime checks about strides and memory. A scalar loop that is
10626         // rarely used is not worth unrolling.
10627         if (!LB.areSafetyChecksAdded())
10628           DisableRuntimeUnroll = true;
10629       }
10630       // Report the vectorization decision.
10631       ORE->emit([&]() {
10632         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10633                                   L->getHeader())
10634                << "vectorized loop (vectorization width: "
10635                << NV("VectorizationFactor", VF.Width)
10636                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10637       });
10638     }
10639 
10640     if (ORE->allowExtraAnalysis(LV_NAME))
10641       checkMixedPrecision(L, ORE);
10642   }
10643 
10644   Optional<MDNode *> RemainderLoopID =
10645       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10646                                       LLVMLoopVectorizeFollowupEpilogue});
10647   if (RemainderLoopID.hasValue()) {
10648     L->setLoopID(RemainderLoopID.getValue());
10649   } else {
10650     if (DisableRuntimeUnroll)
10651       AddRuntimeUnrollDisableMetaData(L);
10652 
10653     // Mark the loop as already vectorized to avoid vectorizing again.
10654     Hints.setAlreadyVectorized();
10655   }
10656 
10657   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10658   return true;
10659 }
10660 
10661 LoopVectorizeResult LoopVectorizePass::runImpl(
10662     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10663     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10664     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10665     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10666     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10667   SE = &SE_;
10668   LI = &LI_;
10669   TTI = &TTI_;
10670   DT = &DT_;
10671   BFI = &BFI_;
10672   TLI = TLI_;
10673   AA = &AA_;
10674   AC = &AC_;
10675   GetLAA = &GetLAA_;
10676   DB = &DB_;
10677   ORE = &ORE_;
10678   PSI = PSI_;
10679 
10680   // Don't attempt if
10681   // 1. the target claims to have no vector registers, and
10682   // 2. interleaving won't help ILP.
10683   //
10684   // The second condition is necessary because, even if the target has no
10685   // vector registers, loop vectorization may still enable scalar
10686   // interleaving.
10687   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10688       TTI->getMaxInterleaveFactor(1) < 2)
10689     return LoopVectorizeResult(false, false);
10690 
10691   bool Changed = false, CFGChanged = false;
10692 
10693   // The vectorizer requires loops to be in simplified form.
10694   // Since simplification may add new inner loops, it has to run before the
10695   // legality and profitability checks. This means running the loop vectorizer
10696   // will simplify all loops, regardless of whether anything ends up being
10697   // vectorized.
10698   for (auto &L : *LI)
10699     Changed |= CFGChanged |=
10700         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
10701 
10702   // Build up a worklist of inner-loops to vectorize. This is necessary as
10703   // the act of vectorizing or partially unrolling a loop creates new loops
10704   // and can invalidate iterators across the loops.
10705   SmallVector<Loop *, 8> Worklist;
10706 
10707   for (Loop *L : *LI)
10708     collectSupportedLoops(*L, LI, ORE, Worklist);
10709 
10710   LoopsAnalyzed += Worklist.size();
10711 
10712   // Now walk the identified inner loops.
10713   while (!Worklist.empty()) {
10714     Loop *L = Worklist.pop_back_val();
10715 
10716     // For the inner loops we actually process, form LCSSA to simplify the
10717     // transform.
10718     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10719 
10720     Changed |= CFGChanged |= processLoop(L);
10721   }
10722 
10723   // Process each loop nest in the function.
10724   return LoopVectorizeResult(Changed, CFGChanged);
10725 }
10726 
10727 PreservedAnalyses LoopVectorizePass::run(Function &F,
10728                                          FunctionAnalysisManager &AM) {
10729     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
10730     auto &LI = AM.getResult<LoopAnalysis>(F);
10731     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
10732     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
10733     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
10734     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
10735     auto &AA = AM.getResult<AAManager>(F);
10736     auto &AC = AM.getResult<AssumptionAnalysis>(F);
10737     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
10738     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
10739 
10740     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
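          // Build a callback that computes LoopAccessInfo on demand for each
          // loop via the inner loop analysis manager.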
10741     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
10742         [&](Loop &L) -> const LoopAccessInfo & {
10743       LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
10744                                         TLI, TTI, nullptr, nullptr, nullptr};
10745       return LAM.getResult<LoopAccessAnalysis>(L, AR);
10746     };
10747     auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
10748     ProfileSummaryInfo *PSI =
10749         MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
10750     LoopVectorizeResult Result =
10751         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
10752     if (!Result.MadeAnyChange)
10753       return PreservedAnalyses::all();
10754     PreservedAnalyses PA;
10755 
10756     // We currently do not preserve loopinfo/dominator analyses with outer loop
10757     // vectorization. Until this is addressed, mark these analyses as preserved
10758     // only for non-VPlan-native path.
10759     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
10760     if (!EnableVPlanNativePath) {
10761       PA.preserve<LoopAnalysis>();
10762       PA.preserve<DominatorTreeAnalysis>();
10763     }
10764 
10765     if (Result.MadeCFGChange) {
10766       // Making CFG changes likely means a loop got vectorized. Indicate that
10767       // extra simplification passes should be run.
10768       // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
10769       // be run if runtime checks have been added.
10770       AM.getResult<ShouldRunExtraVectorPasses>(F);
10771       PA.preserve<ShouldRunExtraVectorPasses>();
10772     } else {
10773       PA.preserveSet<CFGAnalyses>();
10774     }
10775     return PA;
10776 }
10777 
10778 void LoopVectorizePass::printPipeline(
10779     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10780   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10781       OS, MapClassName2PassName);
10782 
10783   OS << "<";
10784   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10785   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10786   OS << ">";
10787 }
10788